Exemplos de Corpus em Python, exemplos de quanteda.Corpus em Python

Exemplo n.º 1

0

Exibir arquivo

def get_docs_names(path):
    """Build a corpus from the manifesto files found directly in ``path``.

    Every entry of ``path`` is read as UTF-8 text.  Its file name is split
    on ``'_'`` and the first five fields are stored as document variables,
    in this order: country (upper-cased), level, year, lang, party (with a
    ``.txt`` suffix removed).

    Args:
        path: Directory containing the manifesto files.  A trailing
            separator is optional.

    Returns:
        The ``quanteda.Corpus`` holding one preprocessed document per file.

    Raises:
        IndexError: If a file name has fewer than five ``'_'`` fields.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    import codecs  # local import: not guaranteed by the module's import block

    manifs = quanteda.Corpus()
    for f in os.listdir(path):
        print(f)
        # os.path.join works whether or not ``path`` ends with a separator;
        # the context manager closes the handle even if decoding fails.
        with codecs.open(os.path.join(path, f), encoding='utf-8') as handle:
            text = handle.read()
        bits = f.split('_')
        variables = {
            "year": bits[2],
            "country": bits[0].upper(),
            "party": bits[4].replace('.txt', ''),
            "lang": bits[3],
            "level": bits[1],
        }
        d = quanteda.Document(text, fname=f, variables=variables)
        d.preprocess()
        manifs.add_docs(d)
    return manifs

Exemplo n.º 2

0

Exibir arquivo

def get_docs_folders(path):
    """Build a corpus from a ``<path>/<country>/<year>/<file>`` directory tree.

    The country code and year are taken from the directory names, and the
    party from the part of the file name before the first ``'_'``.  The
    encoding of each file is guessed with ``chardet``.

    Args:
        path: Root directory of the tree.  A trailing separator is optional.

    Returns:
        The ``quanteda.Corpus`` holding one preprocessed document per file.

    Raises:
        UnicodeDecodeError: If a file cannot be decoded with the detected
            (or fallback) encoding.
    """
    manifs = quanteda.Corpus()
    for ctrcode in os.listdir(path):
        print(ctrcode)
        country_dir = os.path.join(path, ctrcode)
        for year in os.listdir(country_dir):
            year_dir = os.path.join(country_dir, year)
            for manif in os.listdir(year_dir):
                # Read raw bytes: chardet works on bytes, and the context
                # manager releases the handle as soon as the read is done.
                with open(os.path.join(year_dir, manif), 'rb') as handle:
                    raw = handle.read()
                # chardet reports None when it cannot guess (e.g. an empty
                # file); fall back to UTF-8 instead of calling decode(None).
                encoding = chardet.detect(raw)['encoding'] or 'utf-8'
                text = raw.decode(encoding)
                party = manif.split('_')[0]
                d = quanteda.Document(text,
                                      fname=manif,
                                      variables={
                                          "year": year,
                                          "country": ctrcode,
                                          "party": party
                                      })
                d.preprocess()
                manifs.add_docs(d)
    return manifs

Exemplo n.º 3

0

Exibir arquivo

import gensim
import string
import nltk
import quanteda
import codecs
import sys
import random

import os  # required by expanduser below; missing from the import list above

path = "/home/paul/Dropbox/populism/"

# Expand "~" so the review folders resolve under the current user's home.
neg_path = os.path.expanduser(
    '~/Dropbox/QUANTESS/corpora/movieReviews/smaller/neg/')
pos_path = os.path.expanduser(
    '~/Dropbox/QUANTESS/corpora/movieReviews/smaller/pos/')

movies = quanteda.Corpus()  # a Corpus has a list of documents

# Add and label the negative reviews.
# NOTE(review): other call sites pass documents unpacked (add_docs(*docs));
# confirm that add_docs accepts a list here.
negs = movies.read_docs(neg_path, {"sent": "neg"})
movies.add_docs(negs)
# Add and label the positive reviews.
pos = movies.read_docs(pos_path, {"sent": "pos"})
print(movies)
movies.add_docs(pos)
movies.preprocess()
# Shuffle in place so negative and positive reviews are interleaved.
random.shuffle(movies.documents)

# One whitespace-tokenised word list per document.
texts = [m.text.split() for m in movies.documents]

Exemplo n.º 4

0

Exibir arquivo

Arquivo: runTopicModel.py Projeto: pnulty/LSEtext

        text = '\n'.join(lines[1:])
        new_docs.append(quanteda.Document(text, filename, atts))
    return new_docs


def read_docs(di):
    """Read every regular file directly inside the directory ``di``.

    Sub-directories are skipped.  Files are decoded as UTF-8 and visited in
    sorted name order so the result is the same on every run.

    Args:
        di: Path of the directory to read.

    Returns:
        A list with one ``[lines, file_path]`` entry per file, where
        ``lines`` is the list of decoded lines (line endings kept) and
        ``file_path`` is ``di`` joined with the file name.

    Raises:
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    docs = []
    for name in sorted(listdir(di)):
        file_path = join(di, name)
        if not isfile(file_path):
            continue
        # Close each handle right after reading instead of leaking it.
        with codecs.open(file_path, encoding='utf-8') as handle:
            docs.append([handle.readlines(), file_path])
    return docs


# Load the raw articles and wrap them in a preprocessed corpus.
inpath = "/home/paul/Dropbox/LSETextMining/code/articles"
docs = read_docs(inpath)
news_corpus = quanteda.Corpus()
temp = make_docs(docs)
news_corpus.documents.extend(temp)
news_corpus.preprocess()

# Stopword list: one entry per line, surrounding whitespace stripped.
stopfile = "/home/paul/Dropbox/LSETextMining/code/stopwords.txt"
with codecs.open(stopfile, encoding='utf-8') as stop_handle:
    stopwords = [s.strip() for s in stop_handle.readlines()]

# Hash-based lookup: membership is tested once per token, so scanning the
# list every time would cost O(len(stopwords)) per word.
stopword_set = frozenset(stopwords)

# One token list per document with the stopwords removed.  Real lists (not
# lazy filter objects) are built so the data can be iterated repeatedly.
texts = [
    [word for word in m.text.split() if word not in stopword_set]
    for m in news_corpus.documents
]

dictionary = corpora.Dictionary(texts)

Exemplo n.º 5

0

Exibir arquivo

import nltk
import os
import quanteda
import random
import zipfile
from nltk.classify import SklearnClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC

# Party codes that are labelled "Left" when the "wing" variable is assigned;
# any other party code is labelled "Right".
leftParties = [
    "Laba",
    "Lab",
    "Lib",
    "Comm",
    "LibSDP",
    "SF",
    "SEP",
    "TW",
    "Gr",
    "Resp",
]

ukMan = quanteda.Corpus()
with zipfile.ZipFile("/home/paul/UK_Manifestos.zip") as myzip:
    for n in myzip.namelist():
        d = quanteda.Document(myzip.open(n).read(), n)
        ukMan.documents.append(d)
        n = n.replace('Con_a', 'Cona')
        n = n.replace('Lab_a', 'Laba')
        n = n.replace('.txt', '')
        v = n.split('_')
        wing = "None"
        if v[4] in leftParties: wing = "Left"
        else: wing = "Right"
        d.add_variables({
            "elecType": v[1],
            "year": v[2],
            "lang": v[3],
            "party": v[4],
            "wing": wing

Exemplo n.º 6

0

Exibir arquivo

    """ Kohei's dictreading function"""
    dictionary = {}
    f = codecs.open(path, 'r', 'utf-8-sig')
    lines = f.readlines()
    f.close()
    for line in lines:
        if line[0] != '#' and len(line.strip()):
            line = line.replace(';', ',')
            label = line.strip().split(':')[0].split(',')
            words = line.strip().split(':')[1].split(',')
            words = [unicode(w.strip()) for w in words]
            dictionary[label[2]] = words
    return (dictionary)


manifs = quanteda.Corpus()

path = "/home/paul/Dropbox/QUANTESS/corpora/UK Manifestos/"

files = os.listdir(path)
for fname in files:
    f = open(path + fname, 'r')
    text = f.read()
    text = text.decode('latin1')
    temp = fname.split('_')
    country = temp[0]
    year = temp[2]
    party = temp[4].replace('.txt', '')
    d = quanteda.Document(text,
                          fname=fname,
                          variables={

Exemplo n.º 7

0

Exibir arquivo

import nltk
import quanteda
import codecs
import sys
import random
import numpy as np


import os  # required by expanduser below; missing from the import list above

path = "/home/paul/Dropbox/populism/"

# Expand "~" so the review folders resolve under the current user's home.
neg_path = os.path.expanduser(
    '~/Dropbox/QUANTESS/corpora/movieReviews/smaller/neg/')
pos_path = os.path.expanduser(
    '~/Dropbox/QUANTESS/corpora/movieReviews/smaller/pos/')

movies = quanteda.Corpus()

# Add and label the negative reviews.
negs = movies.read_docs(neg_path, {"sent": "neg"})
movies.add_docs(*negs)
# Add and label the positive reviews.
pos = movies.read_docs(pos_path, {"sent": "pos"})
movies.add_docs(*pos)
movies.preprocess()
# Shuffle in place so negative and positive reviews are interleaved.
random.shuffle(movies.documents)
# "1" / "2" are progress markers printed before and after make_fdist().
print("1")
movies.make_fdist()
print("2")

# Zero-filled matrix with one row per document and one column per vocabulary
# entry (presumably the document-feature matrix, filled in later).
dfm = np.zeros((len(movies.documents), len(movies.vocab)))