示例#1
0
def make_docs(docs):
    """Turn ``(lines, filename)`` entries into ``quanteda.Document`` objects.

    For each entry, the first element of ``lines`` is passed to the
    document as its attributes and the remaining elements are joined with
    newlines to form the document text.

    Args:
        docs: Iterable of indexable entries where ``entry[0]`` is the
            sequence of lines and ``entry[1]`` is the file name.

    Returns:
        A list with one ``quanteda.Document`` per entry, in input order.
    """
    def _to_document(entry):
        # entry[0] holds the lines, entry[1] the file name.
        content, name = entry[0], entry[1]
        header = content[0]
        body = '\n'.join(content[1:])
        return quanteda.Document(body, name, header)

    return [_to_document(entry) for entry in docs]
示例#2
0
def get_docs_names(path):
    """Load every file in ``path`` into a ``quanteda.Corpus``.

    Document variables come from the file name, which is split on ``_``
    and read positionally as ``country_level_year_lang_party.txt``.

    Args:
        path: Directory holding the files. A trailing separator is
            optional because paths are built with ``os.path.join``.

    Returns:
        A ``quanteda.Corpus`` containing one document per file, each added
        after ``preprocess()`` has been called on it.

    Raises:
        IndexError: If a file name has fewer than five ``_``-separated
            parts.
        UnicodeDecodeError: If a file's contents are not valid UTF-8.
    """
    manifs = quanteda.Corpus()
    for f in os.listdir(path):
        # Progress output; the parenthesised form prints the same text on
        # Python 2 for a single argument.
        print(f)
        # Context manager so the handle is closed after each file instead
        # of being left to the garbage collector.
        with open(os.path.join(path, f)) as handle:
            text = handle.read()
        text = unicode(text, 'utf-8')
        bits = f.split('_')
        country = bits[0]
        level = bits[1]
        year = bits[2]
        lang = bits[3]
        party = bits[4].replace('.txt', '')
        d = quanteda.Document(
            text,
            fname=f,
            variables={
                "year": year,
                "country": country.upper(),
                "party": party,
                "lang": lang,
                "level": level,
            })
        d.preprocess()
        # NOTE(review): other loaders in this file call add_docs([d]) with a
        # list - confirm which form quanteda.Corpus.add_docs expects.
        manifs.add_docs(d)
    return manifs
示例#3
0
def get_docs_folders(path):
    """Load manifestos laid out as ``path/<country>/<year>/<file>``.

    The country code and year are taken from the directory names; the
    party is the part of the file name before the first ``_``. Each file's
    encoding is guessed with ``chardet`` before decoding.

    Args:
        path: Root directory containing one sub-directory per country
            code. A trailing separator is optional because paths are
            built with ``os.path.join``.

    Returns:
        A ``quanteda.Corpus`` containing one document per file, each added
        after ``preprocess()`` has been called on it.
    """
    manifs = quanteda.Corpus()
    for ctrcode in os.listdir(path):
        # Progress output; the parenthesised form prints the same text on
        # Python 2 for a single argument.
        print(ctrcode)
        country_dir = os.path.join(path, ctrcode)
        for year in os.listdir(country_dir):
            year_dir = os.path.join(country_dir, year)
            for manif in os.listdir(year_dir):
                # Context manager so the handle is closed after each file.
                with open(os.path.join(year_dir, manif)) as handle:
                    text = handle.read()
                res = chardet.detect(text)
                # chardet reports encoding None when it cannot decide
                # (e.g. empty input); fall back to UTF-8 rather than
                # failing inside decode(None).
                text = text.decode(res['encoding'] or 'utf-8')
                party = manif.split('_')[0]
                d = quanteda.Document(text,
                                      fname=manif,
                                      variables={
                                          "year": year,
                                          "country": ctrcode,
                                          "party": party
                                      })
                d.preprocess()
                # NOTE(review): other loaders in this file call
                # add_docs([d]) with a list - confirm which form
                # quanteda.Corpus.add_docs expects.
                manifs.add_docs(d)
    return manifs
示例#4
0
import os
import quanteda
import random
import zipfile
from nltk.classify import SklearnClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC

# Party codes labelled "Left" below; every other party code is labelled
# "Right".
leftParties = [
    "Laba", "Lab", "Lib", "Comm", "LibSDP", "SF", "SEP", "TW", "Gr", "Resp"
]

# Build the corpus directly from the zip archive, one document per member.
ukMan = quanteda.Corpus()
with zipfile.ZipFile("/home/paul/UK_Manifestos.zip") as myzip:
    for n in myzip.namelist():
        d = quanteda.Document(myzip.read(n), n)
        ukMan.documents.append(d)
        # Remove the underscore inside the "Con_a"/"Lab_a" codes and drop
        # the extension before the name is split on "_" into positional
        # fields.
        n = (n.replace('Con_a', 'Cona')
              .replace('Lab_a', 'Laba')
              .replace('.txt', ''))
        v = n.split('_')
        wing = "Left" if v[4] in leftParties else "Right"
        d.add_variables(dict(
            elecType=v[1],
            year=v[2],
            lang=v[3],
            party=v[4],
            wing=wing,
        ))
示例#5
0
path = "/home/paul/Dropbox/QUANTESS/corpora/UK Manifestos/"

# Load every file in ``path`` as a latin1-encoded manifesto. The file name
# is split on "_" and read positionally: field 0 is the country, field 2
# the year and field 4 (minus ".txt") the party.
# NOTE(review): ``manifs`` is not created in this part of the file -
# presumably a quanteda.Corpus defined earlier; confirm.
files = os.listdir(path)
for fname in files:
    # Context manager so each handle is closed once its contents are read
    # (the original left every file open).
    with open(os.path.join(path, fname), 'r') as f:
        text = f.read()
    text = text.decode('latin1')
    temp = fname.split('_')
    country = temp[0]
    year = temp[2]
    party = temp[4].replace('.txt', '')
    d = quanteda.Document(text,
                          fname=fname,
                          variables={
                              "year": year,
                              "country": country,
                              "party": party
                          })
    d.preprocess()
    manifs.add_docs([d])

# Parenthesised form prints the same text on Python 2 for a single argument.
print(manifs)

popwords = read_dictionary('/home/paul/Dropbox/populism/dictionary.txt')

for d in manifs.documents:
    popcounts = 0
    nonpopcounts = 0
    words = d.text.split()
    for w in words:
示例#6
0
            dictionary[label[2]] = words
    return (dictionary)


manifs = quanteda.Corpus()

path = "/home/paul/Dropbox/populism/"

# Walk <path>/txt/<country code>/<year>/<manifesto file> and keep only the
# documents whose country variable is "IRL".
txt_root = os.path.join(path, 'txt')
for ctrcode in os.listdir(txt_root):
    ctr_dir = os.path.join(txt_root, ctrcode)
    for year in os.listdir(ctr_dir):
        year_dir = os.path.join(ctr_dir, year)
        for manif in os.listdir(year_dir):
            # Context manager so each handle is closed once its contents
            # are read (the original left every file open).
            with open(os.path.join(year_dir, manif)) as handle:
                text = handle.read()
            d = quanteda.Document(text,
                                  fname=manif,
                                  variables={
                                      "year": year,
                                      "country": ctrcode
                                  })
            # NOTE(review): files for every country are read and wrapped
            # before this filter runs; if Document keeps ``country``
            # unchanged, the check could move up to the ``ctrcode`` loop.
            if d.variables['country'] == "IRL":
                d.preprocess()
                manifs.add_docs([d])

popwords = read_dictionary('/home/paul/Dropbox/populism/dictionary.txt')

for d in manifs.documents:
    popcounts = 0
    nonpopcounts = 0
    words = d.text.split()
    for w in words:
        match = False
        for pw in popwords[d.variables['country']]: