Python LazyCorpusLoader 예제들, nltk.corpus.LazyCorpusLoader Python 예제들

예제 #1

0

파일 보기

def get_train_test_data():
    """Load the training and test sets from the NLTK treebank corpus.

    The first ``train_num`` parsed sentences form the training set.  The test
    set is a fixed selection of sentences addressed by offsets counted from
    the end of the training slice.  Every tree is passed through
    ``convert2cnf`` before being returned.

    Returns:
        A ``(cnf_train, cnf_test)`` tuple of lists of converted trees.
    """
    train_num = 3800
    # Offsets of the test sentences, relative to the end of the training slice.
    test_index = [0, 1, 4, 12, 16, 19, 21, 35, 37, 42, 43, 44, 45, 47, 54, 56, 62, 63, 65, 68, 71, 76, 79, 83]
    treebank = LazyCorpusLoader('treebank/combined', BracketParseCorpusReader, r'wsj_.*\.mrg')
    # Ask the reader for the sentence sequence once instead of once per test
    # index; the result is only indexed and sliced below.
    parsed = treebank.parsed_sents()
    # Convert to Chomsky normal form, remove auxiliary labels
    cnf_train = [convert2cnf(t) for t in parsed[:train_num]]
    cnf_test = [convert2cnf(parsed[train_num + i]) for i in test_index]
    return cnf_train, cnf_test

예제 #2

0

파일 보기

 def __init__(self,
              tokenizer: Optional[TokenizerI] = SimpleTokenizer(),
              detokenizer: Optional[TokenizerI] = TreebankWordDetokenizer(),
              stopwords: LazyCorpusLoader = stopw):
     """Store the tokenizer pair and resolve the stop-word list.

     Args:
         tokenizer: Tokenizer kept on the instance; its ``language``
             attribute selects which stop-word list is requested.
         detokenizer: Detokenizer kept on the instance.
         stopwords: Corpus loader queried through ``words(language)``.
     """
     self.tokenizer, self.detokenizer = tokenizer, detokenizer
     language = tokenizer.language
     self.stopwords = stopwords.words(language)

예제 #3

0

파일 보기

    if ctb_in_nltk is None:
        eprint(
            'You should run nltk.download(\'ptb\') to fetch some data first!')
        exit(1)

    ctb_in_nltk = join(ctb_in_nltk, 'corpora')
    ctb_in_nltk = join(ctb_in_nltk, 'ctb')

    print('Converting CTB: removing xml tags...')
    convert(args.ctb, ctb_in_nltk)
    print('Importing to nltk...\n')
    from nltk.corpus import BracketParseCorpusReader, LazyCorpusLoader

    ctb = LazyCorpusLoader('ctb',
                           BracketParseCorpusReader,
                           r'chtb_.*\.fid',
                           tagset='unknown')

    # commented out splits are typically for dependency parsing, e.g. Zhang and Clark 2008
    # training = list(range(1, 815 + 1)) + list(range(1001, 1136 + 1))
    # development = list(range(886, 931 + 1)) + list(range(1148, 1151 + 1))
    # test = list(range(816, 885 + 1)) + list(range(1137, 1147 + 1))

    # splits for constituency parsing, see Petrov and Klein 2007, Liu and Zhang 2017
    training = list(range(1, 270 + 1)) + list(range(440, 1151 + 1))
    development = list(range(301, 325 + 1))
    test = list(range(271, 300 + 1))

    # make sure there's no overlap
    assert not (set(training) & set(development))
    assert not (set(training) & set(test))

예제 #4

0

파일 보기

파일: classificador.py 프로젝트: drudi/sentimentAnalysis

    return output


# def spell_word(word):
#     dic = enchant.Dict('pt_BR')
#     output = word
#     if len(word) > 0 and (not dic.check(word)):
#         sugestoes = dic.suggest(word)
#         if len(sugestoes) > 0:
#             output = sugestoes[0]
#     return output


## Start of training
# Categorized plain-text corpus: every non-hidden .txt file, with the category
# taken from the leading negativo/positivo/neutro directory of the file id.
catho_treinamento = LazyCorpusLoader(
    'catho_treinamento', CategorizedPlaintextCorpusReader,
    r'(?!\.).*\.txt', cat_pattern=r'(negativo|positivo|neutro)/.*')

print "Preparando documentos para treinamento..."
sys.stdout.flush()
# One (word list, category) pair per file, grouped by category.
documents_treinamento = [(list(catho_treinamento.words(fileid)), category)
                         for category in catho_treinamento.categories()
                         for fileid in catho_treinamento.fileids(category)]
print "fim da preparacao dos documentos de treinamento."
sys.stdout.flush()
## Pre-processing

# Lower-cased corpus vocabulary. The filter is a substring test against
# string.punctuation, so only tokens that occur contiguously inside that
# string are dropped.
corpus_words = [w.lower()
                for w in catho_treinamento.words()
                if w not in string.punctuation]
                #if w not in string.punctuation and

예제 #5

0

파일 보기

파일: task2_2_3.py 프로젝트: AdiKrasin/nlp_task3

        return tree


def tree_to_production(tree):
    """Build the zero-probability production rooted at ``tree``.

    The left-hand side is the tag of ``tree`` itself and the right-hand side
    holds the tags of its direct children, all obtained through ``get_tag``.
    """
    lhs = get_tag(tree)
    rhs = [get_tag(child) for child in tree]
    return ProbabilisticProduction(lhs, rhs, prob=0)


def tree_to_productions(tree):
    """Yield the production of ``tree`` followed by those of its subtrees.

    The traversal is depth-first, parent before children.  Children that are
    not ``Tree`` instances contribute no production of their own.
    """
    yield tree_to_production(tree)
    for child in tree:
        if not isinstance(child, Tree):
            continue
        for prod in tree_to_productions(child):
            yield prod


# Bracket-parse reader over the 'treebank/combined' corpus files matching wsj_*.mrg.
treebank = LazyCorpusLoader('treebank/combined', BracketParseCorpusReader, r'wsj_.*\.mrg')


def get_productions(productions):

    probabilities = dict()
    productions_to_return = list(set(productions))

    for prod in productions:
        if str(prod) in probabilities:
            probabilities[str(prod)] += 1
        else:
            probabilities[str(prod)] = 1

    amount_of_interior_nodes = len([prod.lhs() for prod in productions if prod.lhs() != Nonterminal('S')])

예제 #6

0

파일 보기

파일: test.py 프로젝트: raiprabh/analytics

import nltk
from nltk.corpus import LazyCorpusLoader, CategorizedPlaintextCorpusReader
from DataCleaning.POSTagger import tagger
from nltk.corpus.reader import sentiwordnet
# Categorized plain-text corpus: every non-hidden .txt file, read as ASCII,
# with the category taken from the leading neg/pos/neutral directory.
movie_reviews123 = LazyCorpusLoader('movie_reviews123',
                                    CategorizedPlaintextCorpusReader,
                                    r'(?!\.).*\.txt',
                                    cat_pattern=r'(neg|pos|neutral)/.*',
                                    encoding='ascii')

print(movie_reviews123.categories())

# NOTE(review): `_pos_tag` is a private nltk.tag helper and is called here
# without any arguments - this looks like it raises TypeError; confirm what
# was intended (presumably tagging a token list).
print(nltk.tag._pos_tag())

예제 #7

0

파일 보기

파일: decision_trees.py 프로젝트: galapijiu/opinions-classifier

import features
import random
import nltk
from nltk.corpus import LazyCorpusLoader, stopwords
from nltk.corpus.reader.plaintext import (
    CategorizedPlaintextCorpusReader,
)

# Categorized plain-text corpus 'tweets_publish_choice': all .txt files, read
# as UTF-8, with the category captured by the leading (\w+) of the file id.
decisions = LazyCorpusLoader(
    'tweets_publish_choice',
    CategorizedPlaintextCorpusReader,
    r'.*\.txt',
    cat_pattern=r'(\w+)/*',
    encoding='utf8',
)

# returns all non-stop words from corpus
def get_top_words():
    """Return up to 500 distinct lower-cased non-stop words of the corpus.

    Builds a frequency distribution over the lower-cased tokens of
    ``decisions`` that are not English stop words, pretty-prints up to 500
    of its entries, and returns the first 500 keys in the distribution's
    iteration order.

    NOTE(review): the name suggests a frequency ranking, but the slice relies
    on the FreqDist iteration order - confirm that is what callers expect.
    """
    # Load the stop-word list once and test membership against a set; the
    # list used to be re-read from the corpus for every single token.
    stop_set = set(stopwords.words('english'))
    lowered = (w.lower() for w in decisions.words())
    all_words = nltk.FreqDist(w for w in lowered if w not in stop_set)
    word_features = list(all_words)[:500]
    all_words.pprint(500)
    return word_features


print(decisions.categories())
# One (word list, category) pair per corpus file, grouped by category.
documents = [(list(decisions.words(fileid)), category)
             for category in decisions.categories()
             for fileid in decisions.fileids(category)]

예제 #8

0

파일 보기

파일: q2_mterics.py 프로젝트: jedimonster/nlp

    x_axis.sort()
    y_axis = [ACCURACY_PER_DISTANCE[x]['matches']/float(ACCURACY_PER_DISTANCE[x]['total']) for x in x_axis]
    x_axis_labeled = ACCURACY_PER_DISTANCE_LABELED.keys()
    x_axis_labeled.sort()
    y_axis_labeled = [ACCURACY_PER_DISTANCE_LABELED[x]['matches']/float(ACCURACY_PER_DISTANCE_LABELED[x]['total']) for x in x_axis_labeled]
    print x_axis
    print y_axis
    import matplotlib.pyplot as plt
    plt.title("Accuracy per distance")
    plt.scatter(x_axis, y_axis, c="blue", marker='*', label="accuracy index")
    plt.scatter(x_axis_labeled, y_axis_labeled, c="red", marker='o', label="accuracy label", alpha=0.5)
    plt.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3, ncol=2, mode="expand", borderaxespad=0.)
    plt.show()

if __name__ == '__main__':
    treebank = LazyCorpusLoader('treebank/combined', BracketParseCorpusReader, r'wsj_.*\.mrg')
    trees = treebank.parsed_sents()

    trees = trees[:5]
    cleaned_trees = [filter_tree(tree) for tree in trees]
    for t in cleaned_trees:
        chomsky_normal_form(tree, factor='right', horzMarkov=1, vertMarkov=1, childChar='|', parentChar='^')

    parser, pcfg = get_parser(cleaned_trees)
    eval_trees(cleaned_trees, parser, pcfg)

    print "----------- Reporting Per Label -----------"
    print ACCURACY_PER_LABEL
    print len(ACCURACY_PER_LABEL)
    for item in ACCURACY_PER_LABEL:
        print item, "--- total -------> ", ACCURACY_PER_LABEL[item]['total']

예제 #9

0

파일 보기

import nltk.classify.util, nltk.metrics
from nltk.classify import DecisionTreeClassifier
from nltk.corpus import LazyCorpusLoader, CategorizedPlaintextCorpusReader
from nltk.corpus import stopwords
from nltk.probability import FreqDist, ConditionalFreqDist
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.stem import WordNetLemmatizer

# Shared WordNet lemmatizer instance.
lemmatizer = WordNetLemmatizer()

# English stop words as a set, for constant-time membership tests.
stopset = set(stopwords.words('english'))

# Categorized plain-text corpus: every non-hidden .txt file, read as ASCII,
# with the category taken from the leading neg/pos directory of the file id.
app_reviews = LazyCorpusLoader('app_reviews',
                               CategorizedPlaintextCorpusReader,
                               r'(?!\.).*\.txt',
                               cat_pattern=r'(neg|pos)/.*',
                               encoding='ascii')


def evaluate_classifier(featureX):

    negIds = app_reviews.fileids('neg')
    posIds = app_reviews.fileids('pos')

    negFeatures = [(featureX(app_reviews.words(fileids=[f])), 'neg')
                   for f in negIds]
    posFeatures = [(featureX(app_reviews.words(fileids=[f])), 'pos')
                   for f in posIds]

    #selects 3/4 of the features to be used for training and 1/4 to be used for testing

예제 #10

0

파일 보기

파일: Language_Processing_and_Python.py 프로젝트: manugallardo/nlp

import nltk
nltk.download()
from nltk.book import *
print text1

from nltk.text import Text
from nltk.corpus import LazyCorpusLoader, PlaintextCorpusReader
# Plain-text corpus 'mytest' made of every non-hidden .txt file.
mytest = LazyCorpusLoader('mytest', PlaintextCorpusReader, r'(?!\.).*\.txt')
# Wrap the words of tresh.txt in a Text to use its exploration helpers.
tresh = Text(mytest.words('tresh.txt'))
tresh.collocations()
tresh.concordance('esto')

예제 #11

0

파일 보기

파일: svm.py 프로젝트: galapijiu/opinions-classifier

from nltk.classify.scikitlearn import SklearnClassifier

from sklearn.externals import joblib
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC

# Command-line arguments: account id, corpus version, and an integer limit.
account_id = sys.argv[1]
version = sys.argv[2]
limit = int(sys.argv[3])

# Categorized plain-text corpus stored under
# tweets_publish_choice_<account>/<version>: all .txt files, read as UTF-8,
# with the category captured by the leading (\w+) of the file id.
decisions = LazyCorpusLoader(
    'tweets_publish_choice_{account}{s}{version}'.format(
        account=account_id,
        version=version,
        s=os.sep,
    ),
    CategorizedPlaintextCorpusReader,
    r'.*\.txt',
    cat_pattern=r'(\w+)/*',
    encoding='utf8',
)

# Path of the same corpus directory under ~/nltk_data/corpora, built from the
# same account/version pair as the loader name above.
home = os.path.expanduser("~")
path = os.path.join(
    home,
    'nltk_data{s}corpora{s}tweets_publish_choice_{account}{s}{version}'.format(
        account=account_id,
        version=version,
        s=os.sep,
    ))

예제 #12

0

파일 보기

파일: Q1.2.py 프로젝트: dianagastrin/NLP

from nltk.corpus import LazyCorpusLoader, BracketParseCorpusReader
from nltk.grammar import Production
from nltk import Tree, Nonterminal

# Bracket-parse reader over the 'treebank/combined' corpus files matching wsj_*.mrg.
treebank = LazyCorpusLoader('treebank/combined', BracketParseCorpusReader, r'wsj_.*\.mrg')

def simplify_functional_tag(tag):
    """Strip functional annotations from a treebank label.

    Everything from the first ``-`` onwards is dropped, so ``NP-SBJ-1``
    becomes ``NP``.  Labels that begin with ``-`` (for example ``-LRB-`` or
    ``-RRB-``) have nothing in front of the dash to keep; they are returned
    unchanged instead of collapsing to the empty string.

    Args:
        tag: The raw label string.

    Returns:
        The simplified label.
    """
    base = tag.split('-', 1)[0]
    # An empty base means the label started with '-': keep it intact.
    return base if base else tag

def get_tag(tree):
    """Return the grammar symbol for a node, or the node itself if it is a leaf.

    Anything that is not a ``Tree`` is handed back untouched.  A ``-NONE-``
    node becomes a nonterminal named after its upper-cased first child; any
    other node becomes a nonterminal named after its simplified label.
    """
    if not isinstance(tree, Tree):
        return tree

    label = tree.label()
    if label == '-NONE-':  # get rid of NONE tags
        return Nonterminal(tree[0].upper())
    return Nonterminal(simplify_functional_tag(label))

def tree_to_production(tree):
    """Build the production rooted at ``tree``.

    The left-hand side is the tag of ``tree`` itself and the right-hand side
    holds the tags of its direct children, all obtained through ``get_tag``.
    """
    lhs = get_tag(tree)
    rhs = [get_tag(child) for child in tree]
    return Production(lhs, rhs)

def tree_to_productions(tree):
    """Yield the production of ``tree`` followed by those of its subtrees.

    The traversal is depth-first, parent before children.  Children that are
    not ``Tree`` instances contribute no production of their own.
    """
    yield tree_to_production(tree)
    for child in tree:
        if not isinstance(child, Tree):
            continue
        for prod in tree_to_productions(child):
            yield prod

def trees_to_productions(trees):
    productions = []

예제 #13

0

파일 보기

파일: nbc.py 프로젝트: soldierkam/pynews


from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader
from nltk import FreqDist, BigramCollocationFinder, BigramAssocMeasures, TrigramCollocationFinder, TrigramAssocMeasures
from nltk.classify.naivebayes import NaiveBayesClassifier
from logger import  logger
from nltk.corpus import LazyCorpusLoader
from nltk.text import TextCollection
import nltk, random, operator, itertools

# Categorized plain-text corpus 'my_corpus': file ids matching '(data).*',
# with the file-to-category mapping read from cats.txt.
my_corpus = LazyCorpusLoader(
    'my_corpus', CategorizedPlaintextCorpusReader, '(data).*',
    cat_file='cats.txt')
stopwords = nltk.corpus.stopwords.words()
# Set copy used only for membership tests; `stopwords` itself stays a list.
_stopword_set = set(stopwords)

# Frequency of lower-cased tokens longer than two characters.  The stop-word
# test is made on the lower-cased form - the same form that is counted - so a
# capitalised stop word such as "The" cannot slip through as "the".
all_words = FreqDist(w.lower() for w in my_corpus.words()
                     if len(w) > 2 and w.lower() not in _stopword_set)
#all_words_inf = {}
#textCollection = TextCollection(my_corpus)
#for word in all_words.keys()[:1000]:
#    score = 0
#    for fileid in my_corpus.fileids():
#        text = my_corpus.raw(fileid)
#        score += textCollection.tf_idf(word, text)
#    all_words_inf[word] = score
#all_words = sorted(all_words_inf.items(), key=operator.itemgetter(1), reverse=False)
# At most 2000 feature words, taken in the order keys() returns them.
word_features = [word for word in all_words.keys() if len(word) > 2][:2000]


def document_features(words_in_document, score_fn=TrigramAssocMeasures.chi_sq, n=300):
    trigram_finder = TrigramCollocationFinder.from_words(words_in_document)
    trigrams = trigram_finder.nbest(score_fn, n)

예제 #14

0

파일 보기

파일: corpus.py 프로젝트: Crystal-Solutions/fyp_scritps

# -*- coding: utf-8 -*-
"""
Created on Mon May  8 17:25:27 2017
This one is to use nltk default corpus loader to load our annotations
@author: Janaka
"""

from nltk.corpus import LazyCorpusLoader, ConllChunkCorpusReader

# Directory holding the annotated data, relative to the working directory.
SOURCE_DIR = '../boi_pos_data/'

# CoNLL chunk reader over the single annotated feedback file.
# NOTE(review): ('T') is a plain string, not a one-element tuple - confirm the
# reader accepts a bare string for its chunk types.
# NOTE(review): SOURCE_DIR is used both as the corpus name and as a prefix of
# the file id - verify that the loader resolves this path as intended.
boi_pos_data = LazyCorpusLoader(SOURCE_DIR,
                                ConllChunkCorpusReader,
                                [SOURCE_DIR + 'feedback_cs2012_2-result.txt'],
                                ('T'),
                                tagset='wsj',
                                encoding='ascii')

# NOTE(review): 'test.txt' is not among the file ids given to the loader
# above - confirm this file is expected to be readable.
test_sents = boi_pos_data.chunked_sents('test.txt', chunk_types=['T'])
print(test_sents)

예제 #15

0

파일 보기

            for child in elt:
                cc = self.handle_word(child)
                if child.tag == 'instance':  # id, lemma, pos, tkn
                    inst.append((cc[:3]))  # id, lemma, pos
                    sent.append(cc[1:])  # lemma, pos, tkn
                else:
                    sent.append(cc)  # lemma, pos, tkn

            return inst, sent

        return [self.handle_word(child) for child in elt]

    def handle_word(self, elt):
        """Turn one XML word element into a tuple of its fields.

        Returns ``(lemma, pos, token)``.  When the element is an ``instance``
        and ``self._unit`` is ``'instance'`` or ``'both'``, the element's
        ``id`` attribute is prepended, giving ``(id, lemma, pos, token)``.
        A missing or empty text yields an empty token, and a missing
        ``lemma`` attribute falls back to the token.
        """
        token = elt.text or ""
        lemma = elt.get('lemma', token)
        pos = elt.get('pos')
        word = (lemma, pos, token)

        # self._unit is only consulted for instance elements.
        if elt.tag == 'instance' and self._unit in ('instance', 'both'):
            return (elt.get('id'),) + word
        return word


# 'new_semcor' corpus read by SemCorReader over the .xml and .gold.key.txt
# files; `wordnet` is forwarded to the reader as an extra positional argument.
new_semcor = LazyCorpusLoader('new_semcor', SemCorReader,
                              r'(.*\.xml)|(.*\.gold\.key\.txt)', wordnet)

예제 #16

0

파일 보기

    for root in nltk.data.path:
        if isdir(root):
            ctb_in_nltk = root

    if ctb_in_nltk is None:
        eprint(
            'You should run nltk.download(\'ptb\') to fetch some data first!')
        exit(1)

    ctb_in_nltk = join(ctb_in_nltk, 'corpora')
    ctb_in_nltk = join(ctb_in_nltk, 'ctb')

    print('Converting CTB: removing xml tags...')
    convert(args.ctb, ctb_in_nltk)
    print('Importing to nltk...\n')
    from nltk.corpus import BracketParseCorpusReader, LazyCorpusLoader

    ctb = LazyCorpusLoader('ctb',
                           BracketParseCorpusReader,
                           r'chtb_\d{4}\.\w{2}',
                           tagset='unknown')

    training = list(range(1, 815 + 1)) + list(range(1001, 1136 + 1))
    development = list(range(886, 931 + 1)) + list(range(1148, 1151 + 1))
    test = list(range(816, 885 + 1)) + list(range(1137, 1147 + 1))

    root_path = args.output
    combine_fids(training, join(root_path, 'train.txt'))
    combine_fids(development, join(root_path, 'dev.txt'))
    combine_fids(test, join(root_path, 'test.txt'))