def get_docs_names(path):
    """Build a corpus from files whose metadata is encoded in the file name.

    Every entry of ``path`` is read as UTF-8 text.  The file name is split on
    underscores and the first five pieces are taken, in order, as country,
    level, year, language and party (a ``.txt`` suffix is removed from the
    party piece).  Each file becomes one preprocessed ``quanteda.Document``
    carrying those values as variables, with the country upper-cased.

    Args:
        path: Directory holding the text files.  A trailing separator is
            optional.

    Returns:
        A ``quanteda.Corpus`` containing one document per directory entry.

    Raises:
        IndexError: If a file name has fewer than five underscore-separated
            pieces.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    manifs = quanteda.Corpus()
    for f in os.listdir(path):
        # Progress output; the parenthesised form prints the same thing under
        # the Python 2 print statement.
        print(f)
        # os.path.join tolerates a missing trailing separator on `path`, and
        # the context manager closes the handle even if decoding fails.
        with open(os.path.join(path, f), 'rb') as handle:
            text = handle.read().decode('utf-8')
        bits = f.split('_')
        country = bits[0]
        level = bits[1]
        year = bits[2]
        lang = bits[3]
        party = bits[4].replace('.txt', '')
        d = quanteda.Document(text, fname=f, variables={
            "year": year,
            "country": country.upper(),
            "party": party,
            "lang": lang,
            "level": level,
        })
        d.preprocess()
        manifs.add_docs(d)
    return manifs
def get_docs_folders(path):
    """Build a corpus from a ``<country>/<year>/<file>`` directory tree.

    The first directory level under ``path`` supplies the ``country``
    variable, the second level the ``year`` variable, and the part of each
    file name before the first underscore the ``party`` variable.  The
    encoding of every file is guessed with ``chardet`` before decoding.

    Args:
        path: Root directory of the tree.  A trailing separator is optional.

    Returns:
        A ``quanteda.Corpus`` with one preprocessed document per file.
    """
    manifs = quanteda.Corpus()
    for ctrcode in os.listdir(path):
        # Progress output; prints the same under the Python 2 print statement.
        print(ctrcode)
        country_dir = os.path.join(path, ctrcode)
        for year in os.listdir(country_dir):
            year_dir = os.path.join(country_dir, year)
            for manif in os.listdir(year_dir):
                # Read raw bytes and let the context manager close the handle.
                with open(os.path.join(year_dir, manif), 'rb') as handle:
                    raw = handle.read()
                encoding = chardet.detect(raw)['encoding']
                if encoding:
                    text = raw.decode(encoding)
                else:
                    # chardet reports None when it cannot guess (e.g. an empty
                    # file); decoding with None would raise TypeError, so fall
                    # back to a lossy UTF-8 decode instead of aborting.
                    text = raw.decode('utf-8', 'replace')
                party = manif.split('_')[0]
                d = quanteda.Document(text, fname=manif, variables={
                    "year": year,
                    "country": ctrcode,
                    "party": party,
                })
                d.preprocess()
                manifs.add_docs(d)
    return manifs
import gensim
import string
import nltk
import quanteda
import codecs
import sys
import random

path = "/home/paul/Dropbox/populism/"

# Review folders are given relative to the home directory and expanded here so
# the script does not depend on one particular machine.
neg_path = os.path.expanduser(
    '~/Dropbox/QUANTESS/corpora/movieReviews/smaller/neg/')
pos_path = os.path.expanduser(
    '~/Dropbox/QUANTESS/corpora/movieReviews/smaller/pos/')

# The corpus holds a list of documents.
movies = quanteda.Corpus()

# Negative reviews: read with the variable sent="neg", then added.
negs = movies.read_docs(neg_path, {"sent": "neg"})
movies.add_docs(negs)

# Positive reviews: read with sent="pos"; the corpus is printed before they
# are added.
pos = movies.read_docs(pos_path, {"sent": "pos"})
print(movies)
movies.add_docs(pos)

movies.preprocess()
random.shuffle(movies.documents)

# One whitespace-split token list per document, in the shuffled order.
texts = [m.text.split() for m in movies.documents]
text = '\n'.join(lines[1:]) new_docs.append(quanteda.Document(text, filename, atts)) return new_docs def read_docs(di): docs = [] file_list = [join(di, f) for f in listdir(di) if isfile(join(di, f))] for f in file_list: docs.append([codecs.open(f, encoding='utf-8').readlines(), f]) return docs inpath = "/home/paul/Dropbox/LSETextMining/code/articles" docs = read_docs(inpath) news_corpus = quanteda.Corpus() temp = make_docs(docs) news_corpus.documents.extend(temp) news_corpus.preprocess() texts = [] stopfile = "/home/paul/Dropbox/LSETextMining/code/stopwords.txt" stopwords = [ s.strip() for s in codecs.open(stopfile, encoding='utf-8').readlines() ] for m in news_corpus.documents: words = m.text.split() words = filter(lambda word: word not in stopwords, words) texts.append(words) dictionary = corpora.Dictionary(texts)
import nltk import os import quanteda import random import zipfile from nltk.classify import SklearnClassifier from sklearn.naive_bayes import BernoulliNB from sklearn.svm import SVC leftParties = [ "Laba", "Lab", "Lib", "Comm", "LibSDP", "SF", "SEP", "TW", "Gr", "Resp" ] ukMan = quanteda.Corpus() with zipfile.ZipFile("/home/paul/UK_Manifestos.zip") as myzip: for n in myzip.namelist(): d = quanteda.Document(myzip.open(n).read(), n) ukMan.documents.append(d) n = n.replace('Con_a', 'Cona') n = n.replace('Lab_a', 'Laba') n = n.replace('.txt', '') v = n.split('_') wing = "None" if v[4] in leftParties: wing = "Left" else: wing = "Right" d.add_variables({ "elecType": v[1], "year": v[2], "lang": v[3], "party": v[4], "wing": wing
""" Kohei's dictreading function""" dictionary = {} f = codecs.open(path, 'r', 'utf-8-sig') lines = f.readlines() f.close() for line in lines: if line[0] != '#' and len(line.strip()): line = line.replace(';', ',') label = line.strip().split(':')[0].split(',') words = line.strip().split(':')[1].split(',') words = [unicode(w.strip()) for w in words] dictionary[label[2]] = words return (dictionary) manifs = quanteda.Corpus() path = "/home/paul/Dropbox/QUANTESS/corpora/UK Manifestos/" files = os.listdir(path) for fname in files: f = open(path + fname, 'r') text = f.read() text = text.decode('latin1') temp = fname.split('_') country = temp[0] year = temp[2] party = temp[4].replace('.txt', '') d = quanteda.Document(text, fname=fname, variables={
import nltk
import quanteda
import codecs
import sys
import random
import numpy as np

path = "/home/paul/Dropbox/populism/"

# Review folders are given relative to the home directory and expanded here so
# the script does not depend on one particular machine.
neg_path = os.path.expanduser(
    '~/Dropbox/QUANTESS/corpora/movieReviews/smaller/neg/')
pos_path = os.path.expanduser(
    '~/Dropbox/QUANTESS/corpora/movieReviews/smaller/pos/')

movies = quanteda.Corpus()

# Negative reviews: read with sent="neg", then passed to add_docs unpacked.
negs = movies.read_docs(neg_path, {"sent": "neg"})
movies.add_docs(*negs)

# Positive reviews: same treatment with sent="pos".
pos = movies.read_docs(pos_path, {"sent": "pos"})
movies.add_docs(*pos)

movies.preprocess()
random.shuffle(movies.documents)

# "1" and "2" are progress markers printed around make_fdist().
print("1")
movies.make_fdist()
print("2")

# Zero matrix with one row per document and one column per vocabulary entry.
n_docs = len(movies.documents)
n_terms = len(movies.vocab)
dfm = np.zeros((n_docs, n_terms))