Example #1
from nltk.corpus import LazyCorpusLoader, BracketParseCorpusReader

def get_train_test_data():
    '''
    Load training and test set from nltk corpora
    '''
    train_num = 3800
    #Split the data into training and test set
    test_index = [0, 1, 4, 12, 16, 19, 21, 35, 37, 42, 43, 44, 45, 47, 54, 56, 62, 63, 65, 68, 71, 76, 79, 83]
    treebank = LazyCorpusLoader('treebank/combined', BracketParseCorpusReader, r'wsj_.*\.mrg')
    cnf_train = treebank.parsed_sents()[:train_num]
    cnf_test = [treebank.parsed_sents()[i+train_num] for i in test_index]
    # Convert to Chomsky normal form and remove auxiliary labels
    # (convert2cnf is defined elsewhere in the source module)
    cnf_train = [convert2cnf(t) for t in cnf_train]
    cnf_test = [convert2cnf(t) for t in cnf_test]
    return cnf_train, cnf_test
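
convert2cnf is not shown above. A minimal sketch of such a helper, using NLTK's built-in tree transforms (the function name and option values are assumptions, not the original code):

def convert2cnf(tree):
    # Hypothetical reconstruction: collapse unary chains, then binarize.
    tree = tree.copy(deep=True)
    tree.collapse_unary(collapsePOS=True)    # drop A -> B unary productions
    tree.chomsky_normal_form(horzMarkov=2)   # right-factored binarization
    return tree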
Example #2
    # (typing.Optional, TokenizerI, SimpleTokenizer, TreebankWordDetokenizer
    # and stopw are imported/defined elsewhere in the source module)
    def __init__(self,
                 tokenizer: Optional[TokenizerI] = SimpleTokenizer(),
                 detokenizer: Optional[TokenizerI] = TreebankWordDetokenizer(),
                 stopwords: LazyCorpusLoader = stopw):
        self.tokenizer = tokenizer
        self.detokenizer = detokenizer
        # note: tokenizer=None (allowed by the Optional hint) would break this lookup
        self.stopwords = stopwords.words(tokenizer.language)
Example #3
    if ctb_in_nltk is None:
        eprint(
            'You should run nltk.download(\'ptb\') to fetch some data first!')
        exit(1)

    ctb_in_nltk = join(ctb_in_nltk, 'corpora')
    ctb_in_nltk = join(ctb_in_nltk, 'ctb')

    print('Converting CTB: removing xml tags...')
    convert(args.ctb, ctb_in_nltk)
    print('Importing to nltk...\n')
    from nltk.corpus import BracketParseCorpusReader, LazyCorpusLoader

    ctb = LazyCorpusLoader('ctb',
                           BracketParseCorpusReader,
                           r'chtb_.*\.fid',
                           tagset='unknown')

    # commented out splits are typically for dependency parsing, e.g. Zhang and Clark 2008
    # training = list(range(1, 815 + 1)) + list(range(1001, 1136 + 1))
    # development = list(range(886, 931 + 1)) + list(range(1148, 1151 + 1))
    # test = list(range(816, 885 + 1)) + list(range(1137, 1147 + 1))

    # splits for constituency parsing, see Petrov and Klein 2007, Liu and Zhang 2017
    training = list(range(1, 270 + 1)) + list(range(440, 1151 + 1))
    development = list(range(301, 325 + 1))
    test = list(range(271, 300 + 1))

    # make sure there's no overlap
    assert not (set(training) & set(development))
    assert not (set(training) & set(test))
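
    # Not part of the original, which stops at the sanity checks: one way
    # (an assumption) to materialize a split, given fileids of the form
    # chtb_XXXX.fid:
    dev_ids = set(development)
    dev_fileids = [f for f in ctb.fileids()
                   if int(f.split('_')[1].split('.')[0]) in dev_ids]
    dev_trees = ctb.parsed_sents(dev_fileids)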
Example #4
    return output


# def spell_word(word):
#     dic = enchant.Dict('pt_BR')
#     output = word
#     if len(word) > 0 and (not dic.check(word)):
#         sugestoes = dic.suggest(word)
#         if len(sugestoes) > 0:
#             output = sugestoes[0]
#     return output


## Start of training
# (this snippet assumes earlier imports of sys, string, and the nltk.corpus
# loaders, which fall above the point where the snippet was cut)
catho_treinamento = LazyCorpusLoader(
    'catho_treinamento', CategorizedPlaintextCorpusReader,
    r'(?!\.).*\.txt', cat_pattern=r'(negativo|positivo|neutro)/.*')

print "Preparando documentos para treinamento..."
sys.stdout.flush()
documents_treinamento = [(list(catho_treinamento.words(fileid)), category)
                         for category in catho_treinamento.categories()
                         for fileid in catho_treinamento.fileids(category)]
print "fim da preparacao dos documentos de treinamento."
sys.stdout.flush()
## Preprocessing

corpus_words = [w.lower()
                for w in catho_treinamento.words()
                if w not in string.punctuation]
                #if w not in string.punctuation and
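
The snippet is cut off mid-filter. A typical next step (an assumption, mirroring the other examples here) is a frequency distribution over the cleaned words to pick feature candidates:

import nltk
all_words = nltk.FreqDist(corpus_words)
word_features = [w for w, _ in all_words.most_common(2000)]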
Example #5
        return tree


def tree_to_production(tree):
    # get_tag is defined in Example #12 below; prob=0 is only a placeholder
    return ProbabilisticProduction(get_tag(tree), [get_tag(child) for child in tree], prob=0)


def tree_to_productions(tree):
    yield tree_to_production(tree)
    for child in tree:
        if isinstance(child, Tree):
            for prod in tree_to_productions(child):
                yield prod


treebank = LazyCorpusLoader('treebank/combined', BracketParseCorpusReader, r'wsj_.*\.mrg')


def get_productions(productions):

    probabilities = dict()
    productions_to_return = list(set(productions))

    for prod in productions:
        if str(prod) in probabilities:
            probabilities[str(prod)] += 1
        else:
            probabilities[str(prod)] = 1

    amount_of_interior_nodes = len([prod.lhs() for prod in productions if prod.lhs() != Nonterminal('S')])
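    # The snippet ends mid-function. An assumed completion (not the original
    # code): turn counts into per-LHS relative frequencies via set_prob().
    lhs_counts = {}
    for prod in productions:
        lhs_counts[prod.lhs()] = lhs_counts.get(prod.lhs(), 0) + 1
    for prod in productions_to_return:
        prod.set_prob(probabilities[str(prod)] / lhs_counts[prod.lhs()])
    return productions_to_return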
Example #6
import nltk
from nltk.corpus import LazyCorpusLoader, CategorizedPlaintextCorpusReader
from DataCleaning.POSTagger import tagger
from nltk.corpus.reader import sentiwordnet
movie_reviews123 = LazyCorpusLoader('movie_reviews123',
                                    CategorizedPlaintextCorpusReader,
                                    r'(?!\.).*\.txt',
                                    cat_pattern=r'(neg|pos|neutral)/.*',
                                    encoding='ascii')

print(movie_reviews123.categories())

# nltk.tag._pos_tag is private and requires arguments; use the public API instead:
print(nltk.pos_tag(movie_reviews123.words()[:10]))
Example #7
import features
import random
import nltk
from nltk.corpus import LazyCorpusLoader, stopwords
from nltk.corpus.reader.plaintext import (
    CategorizedPlaintextCorpusReader,
)

decisions = LazyCorpusLoader(
    'tweets_publish_choice',
    CategorizedPlaintextCorpusReader,
    r'.*\.txt',
    cat_pattern=r'(\w+)/*',
    encoding='utf8',
)

# returns all non-stop words from corpus
def get_top_words():
    all_words = nltk.FreqDist(
        w.lower() for w in decisions.words()
        if not w.lower() in stopwords.words('english')
    )
    # pick the 500 most frequent words; list(all_words)[:500] would take
    # insertion order, not frequency order
    word_features = [w for w, _ in all_words.most_common(500)]
    all_words.pprint(500)
    return word_features


print(decisions.categories())
documents = [(list(decisions.words(fileid)), category)
             for category in decisions.categories()
             for fileid in decisions.fileids(category)]
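
Not in the snippet: these pieces are usually wired together with the classic NLTK-book pattern (an assumption about where this code is headed):

top_words = get_top_words()

def document_features(document):
    words_in_doc = set(document)
    return {'contains({})'.format(w): (w in words_in_doc) for w in top_words}

random.shuffle(documents)
featuresets = [(document_features(d), c) for (d, c) in documents]
classifier = nltk.NaiveBayesClassifier.train(featuresets)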
Example #8
    x_axis.sort()
    y_axis = [ACCURACY_PER_DISTANCE[x]['matches'] / float(ACCURACY_PER_DISTANCE[x]['total']) for x in x_axis]
    # .keys() returns a view in Python 3, so sort via sorted()
    x_axis_labeled = sorted(ACCURACY_PER_DISTANCE_LABELED.keys())
    y_axis_labeled = [ACCURACY_PER_DISTANCE_LABELED[x]['matches'] / float(ACCURACY_PER_DISTANCE_LABELED[x]['total']) for x in x_axis_labeled]
    print(x_axis)
    print(y_axis)
    import matplotlib.pyplot as plt
    plt.title("Accuracy per distance")
    plt.scatter(x_axis, y_axis, c="blue", marker='*', label="accuracy index")
    plt.scatter(x_axis_labeled, y_axis_labeled, c="red", marker='o', label="accuracy label", alpha=0.5)
    plt.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3, ncol=2, mode="expand", borderaxespad=0.)
    plt.show()

if __name__ == '__main__':
    treebank = LazyCorpusLoader('treebank/combined', BracketParseCorpusReader, r'wsj_.*\.mrg')
    trees = treebank.parsed_sents()

    trees = trees[:5]
    cleaned_trees = [filter_tree(tree) for tree in trees]
    for t in cleaned_trees:
        # bug fix: the original passed `tree` (a stale variable) instead of the loop variable
        chomsky_normal_form(t, factor='right', horzMarkov=1, vertMarkov=1, childChar='|', parentChar='^')

    parser, pcfg = get_parser(cleaned_trees)
    eval_trees(cleaned_trees, parser, pcfg)

    print "----------- Reporting Per Label -----------"
    print ACCURACY_PER_LABEL
    print len(ACCURACY_PER_LABEL)
    for item in ACCURACY_PER_LABEL:
        print item, "--- total -------> ", ACCURACY_PER_LABEL[item]['total']
Example #9
import nltk.classify.util, nltk.metrics
from nltk.classify import DecisionTreeClassifier
from nltk.corpus import LazyCorpusLoader, CategorizedPlaintextCorpusReader
from nltk.corpus import stopwords
from nltk.probability import FreqDist, ConditionalFreqDist
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

stopset = set(stopwords.words('english'))

app_reviews = LazyCorpusLoader('app_reviews',
                               CategorizedPlaintextCorpusReader,
                               r'(?!\.).*\.txt',
                               cat_pattern=r'(neg|pos)/.*',
                               encoding='ascii')


def evaluate_classifier(featureX):

    negIds = app_reviews.fileids('neg')
    posIds = app_reviews.fileids('pos')

    negFeatures = [(featureX(app_reviews.words(fileids=[f])), 'neg')
                   for f in negIds]
    posFeatures = [(featureX(app_reviews.words(fileids=[f])), 'pos')
                   for f in posIds]

    #selects 3/4 of the features to be used for training and 1/4 to be used for testing
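    # Assumed completion (the snippet is truncated here); this is the classic
    # NLTK cookbook split, not necessarily the original author's code.
    negCutoff = len(negFeatures) * 3 // 4
    posCutoff = len(posFeatures) * 3 // 4
    trainFeatures = negFeatures[:negCutoff] + posFeatures[:posCutoff]
    testFeatures = negFeatures[negCutoff:] + posFeatures[posCutoff:]
    classifier = DecisionTreeClassifier.train(trainFeatures)
    print('accuracy: %.3f' % nltk.classify.util.accuracy(classifier, testFeatures))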
Example #10
import nltk
nltk.download()  # with no arguments this opens the interactive downloader
from nltk.book import *
print(text1)

from nltk.text import Text
from nltk.corpus import LazyCorpusLoader, PlaintextCorpusReader
mytest = LazyCorpusLoader('mytest', PlaintextCorpusReader, r'(?!\.).*\.txt')
tresh = Text(mytest.words('tresh.txt'))
tresh.collocations()
tresh.concordance('esto')
Example #11
import os
import sys

from nltk.classify.scikitlearn import SklearnClassifier
from nltk.corpus import LazyCorpusLoader, CategorizedPlaintextCorpusReader

import joblib  # sklearn.externals.joblib was removed in scikit-learn 0.23
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC

account_id = sys.argv[1]
version = sys.argv[2]
limit = int(sys.argv[3])

decisions = LazyCorpusLoader(
    'tweets_publish_choice_{account}{s}{version}'.format(
        account=account_id,
        version=version,
        s=os.sep,
    ),
    CategorizedPlaintextCorpusReader,
    r'.*\.txt',
    cat_pattern=r'(\w+)/*',
    encoding='utf8',
)

home = os.path.expanduser("~")
path = os.path.join(
    home,
    'nltk_data{s}corpora{s}tweets_publish_choice_{account}{s}{version}'.format(
        account=account_id,
        version=version,
        s=os.sep,
    ))
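
Not shown in the snippet: a hedged sketch of how these pieces are typically combined, wrapping LinearSVC in NLTK's scikit-learn bridge and saving the model (the bag-of-words encoding and file name are assumptions):

featuresets = [(dict.fromkeys(decisions.words(fileid), True), category)
               for category in decisions.categories()
               for fileid in decisions.fileids(category)][:limit]
classifier = SklearnClassifier(LinearSVC()).train(featuresets)
joblib.dump(classifier, os.path.join(path, 'classifier.joblib'))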
Example #12
from nltk.corpus import LazyCorpusLoader, BracketParseCorpusReader
from nltk.grammar import Production
from nltk import Tree, Nonterminal

treebank = LazyCorpusLoader('treebank/combined', BracketParseCorpusReader, r'wsj_.*\.mrg')

def simplify_functional_tag(tag):
    if '-' in tag:
        tag = tag.split('-')[0]
    return tag

def get_tag(tree):
    if isinstance(tree, Tree):
        if tree.label() == '-NONE-':  # get rid of -NONE- trace tags
            return Nonterminal(tree[0].upper())
        return Nonterminal(simplify_functional_tag(tree.label()))
    else:
        return tree

def tree_to_production(tree):
    return Production(get_tag(tree), [get_tag(child) for child in tree])

def tree_to_productions(tree):
    yield tree_to_production(tree)
    for child in tree:
        if isinstance(child, Tree):
            for prod in tree_to_productions(child):
                yield prod

def trees_to_productions(trees):
    productions = []
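    # Assumed completion (the snippet is truncated): flatten the productions
    # of every tree into one list.
    for tree in trees:
        productions.extend(tree_to_productions(tree))
    return productions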
Example #13

from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader
from nltk import FreqDist, BigramCollocationFinder, BigramAssocMeasures, TrigramCollocationFinder, TrigramAssocMeasures
from nltk.classify.naivebayes import NaiveBayesClassifier
from logger import logger  # project-local logging helper
from nltk.corpus import LazyCorpusLoader
from nltk.text import TextCollection
import nltk, random, operator, itertools

my_corpus = LazyCorpusLoader(
    'my_corpus', CategorizedPlaintextCorpusReader, '(data).*',
    cat_file='cats.txt')
stopwords = set(nltk.corpus.stopwords.words())  # set for fast membership tests

all_words = FreqDist(w.lower() for w in my_corpus.words() if w not in stopwords and len(w) > 2)
#all_words_inf = {}
#textCollection = TextCollection(my_corpus)
#for word in all_words.keys()[:1000]:
#    score = 0
#    for fileid in my_corpus.fileids():
#        text = my_corpus.raw(fileid)
#        score += textCollection.tf_idf(word, text)
#    all_words_inf[word] = score
#all_words = sorted(all_words_inf.items(), key=operator.itemgetter(1), reverse=False)
# the len(w) > 2 filter was already applied when building all_words;
# most_common gives frequency order rather than insertion order
word_features = [w for w, _ in all_words.most_common(2000)]


def document_features(words_in_document, score_fn=TrigramAssocMeasures.chi_sq, n=300):
    trigram_finder = TrigramCollocationFinder.from_words(words_in_document)
    trigrams = trigram_finder.nbest(score_fn, n)
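    # Assumed completion (the snippet is truncated): the conventional ending of
    # this pattern marks each selected trigram as a boolean feature.
    return {trigram: True for trigram in trigrams}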
Example #14
# -*- coding: utf-8 -*-
"""
Created on Mon May  8 17:25:27 2017
Uses NLTK's default corpus loader to load our annotations.
@author: Janaka
"""

from nltk.corpus import LazyCorpusLoader, ConllChunkCorpusReader

SOURCE_DIR = '../boi_pos_data/'

boi_pos_data = LazyCorpusLoader(SOURCE_DIR,
                                ConllChunkCorpusReader,
                                [SOURCE_DIR + 'feedback_cs2012_2-result.txt'],
                                ('T',),  # chunk types; ('T') would just be the string 'T'
                                tagset='wsj',
                                encoding='ascii')

# note: the fileid passed here must be one of the files listed in the loader above
test_sents = boi_pos_data.chunked_sents('test.txt', chunk_types=['T'])
print(test_sents)
Example #15
            for child in elt:
                cc = self.handle_word(child)
                if child.tag == 'instance':  # id, lemma, pos, tkn
                    inst.append(cc[:3])  # id, lemma, pos
                    sent.append(cc[1:])  # lemma, pos, tkn
                else:
                    sent.append(cc)  # lemma, pos, tkn

            return inst, sent

        return [self.handle_word(child) for child in elt]

    def handle_word(self, elt):
        tkn = elt.text
        if not tkn:
            tkn = ""

        lemma = elt.get('lemma', tkn)
        pos = elt.get('pos')

        if elt.tag == 'instance':
            id = elt.get('id')
            if self._unit in ['instance', 'both']:
                return id, lemma, pos, tkn

        return lemma, pos, tkn


# SemCorReader is the custom reader class defined above; the wordnet corpus
# object is passed through to it as an extra reader argument.
new_semcor = LazyCorpusLoader('new_semcor', SemCorReader,
                              r'(.*\.xml)|(.*\.gold\.key\.txt)', wordnet)
Example #16
    for root in nltk.data.path:
        if isdir(root):
            ctb_in_nltk = root

    if ctb_in_nltk is None:
        eprint(
            'You should run nltk.download(\'ptb\') to fetch some data first!')
        exit(1)

    ctb_in_nltk = join(ctb_in_nltk, 'corpora')
    ctb_in_nltk = join(ctb_in_nltk, 'ctb')

    print('Converting CTB: removing xml tags...')
    convert(args.ctb, ctb_in_nltk)
    print('Importing to nltk...\n')
    from nltk.corpus import BracketParseCorpusReader, LazyCorpusLoader

    ctb = LazyCorpusLoader('ctb',
                           BracketParseCorpusReader,
                           r'chtb_\d{4}\.\w{2}',
                           tagset='unknown')

    training = list(range(1, 815 + 1)) + list(range(1001, 1136 + 1))
    development = list(range(886, 931 + 1)) + list(range(1148, 1151 + 1))
    test = list(range(816, 885 + 1)) + list(range(1137, 1147 + 1))

    root_path = args.output
    combine_fids(training, join(root_path, 'train.txt'))
    combine_fids(development, join(root_path, 'dev.txt'))
    combine_fids(test, join(root_path, 'test.txt'))
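
combine_fids is not shown in the snippet. A hedged sketch of such a helper (the chtb_XXXX.* fileid naming is inferred from the loader's regex):

    def combine_fids(fids, out_path):
        # Concatenate the raw bracketed trees of the listed CTB file ids.
        wanted = {'chtb_%04d' % fid for fid in fids}
        with open(out_path, 'w', encoding='utf8') as out:
            for fileid in ctb.fileids():
                if fileid.split('.')[0] in wanted:
                    out.write(ctb.raw(fileid))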