Пример #1
    def read_articles(loc_corpus, article_length):
        """Collect the articles stored in the XML files below *loc_corpus*.

        Every path matching ``<loc_corpus>/*/*`` that ends in ``.xml`` is
        parsed. For each ``<artikel>`` element an ``Article`` is created;
        its ``id`` comes from ``<metadaten>/<artikel-id>``, its ``title``
        from ``<inhalt>/<text>/<titel-liste>/<titel>`` and its ``content``
        from the concatenated ``<absatz>`` texts.

        Args:
            loc_corpus: Root directory of the corpus.
            article_length: An article is kept only when its text is
                strictly longer than this many characters.

        Returns:
            A list of the ``Article`` objects that were long enough.
        """
        corpus_pattern = loc_corpus + '/*/*'

        # NOTE(review): the "online" and "magazine" listings use the same
        # pattern in the source, so every file ends up in both readers and
        # is parsed twice -- confirm whether two different sub-directories
        # were intended.
        years_online = []
        for path in glob(corpus_pattern):
            if path.endswith('.xml'):
                years_online += glob(path)
        years_magazine = []
        for path in glob(corpus_pattern):
            if path.endswith('.xml'):
                years_magazine += glob(path)

        reader_online = XMLCorpusReader(loc_corpus, years_online)
        reader_magazine = XMLCorpusReader(loc_corpus, years_magazine)

        # NOTE(review): both loop bodies were missing in the source;
        # gathering the file ids (magazine first, then online) is the
        # presumed intent.
        fileid_list = list(reader_magazine.fileids())
        fileid_list.extend(reader_online.fileids())

        articles = []

        for fileid in fileid_list:
            # ``recover`` is an lxml parser option -- assumes ET is
            # lxml.etree; TODO confirm against the file's imports.
            parser = ET.XMLParser(recover=True)
            tree = ET.parse(fileid, parser=parser)
            for elem in tree.iter(tag='artikel'):
                add_article = Article()
                has_content = False
                for metadaten in elem.iter(tag='metadaten'):
                    for id_elem in metadaten.iter(tag='artikel-id'):
                        add_article.id = id_elem.text
                # iter(tag=...) only yields elements carrying that tag, so
                # the original ``tag is None`` guards could never fire and
                # have been dropped.
                for inhalt in elem.iter(tag='inhalt'):
                    for child in inhalt.iter(tag='text'):
                        for titel_liste in child.iter(tag='titel-liste'):
                            for title in titel_liste.iter(tag='titel'):
                                add_article.title = title.text
                        # Paragraphs without text are skipped; concatenating
                        # None would raise a TypeError.
                        article_text = "".join(
                            absatz.text
                            for absatz in child.iter(tag='absatz')
                            if absatz.text is not None
                        )
                        if len(article_text) > article_length:
                            add_article.content = article_text
                            has_content = True
                # NOTE(review): the source never adds anything to
                # ``articles``; storing each sufficiently long article once
                # is the presumed intent.
                if has_content:
                    articles.append(add_article)
        return articles
def train():
    """Build a trigram language model from the XML files in ``data/``.

    The words of every ``data/*.xml`` file are joined, split into
    sentences with ``nltk.sent_tokenize`` and then tokenised again per
    sentence. The result is fed to ``NgramModel`` with a Lidstone
    estimator (gamma = 0.2).

    Returns:
        The trained ``NgramModel`` instance.
    """
    import os  # local import keeps this snippet self-contained

    # parse XML and load up words
    print("Loading words from XML files...")
    sentences = []
    files = glob.glob("data/*.xml")
    for i, path in enumerate(files):
        if i > 0 and i % 500 == 0:
            print("%d/%d files loaded, #-sentences: %d" %
                  (i, len(files), len(sentences)))
        # os.path.split copes with any path separator and avoids
        # shadowing the ``dir`` builtin.
        directory, fname = os.path.split(path)
        reader = XMLCorpusReader(directory, fname)
        sentences.extend(nltk.sent_tokenize(" ".join(reader.words())))

    # NOTE(review): the body of the per-sentence loop was missing in the
    # source; tokenising each sentence into its own word list is the
    # presumed intent -- confirm against the NgramModel version in use.
    words = [nltk.word_tokenize(sentence) for sentence in sentences]

    # build a trigram language model with Lidstone smoothing (the
    # estimator below replaces NgramModel's default estimator)
    print("Building language model...")

    def est(fdist, bins):
        return LidstoneProbDist(fdist, 0.2)

    langModel = NgramModel(3, words, estimator=est)
    return langModel
Пример #3
def doc_path_to_dict(path):
    """Parse the XML document at *path* and pass it to ``process_doc``.

    Args:
        path: Path of a single XML file; its directory is used as the
            corpus root and its file name as the only file id.

    Returns:
        Whatever ``process_doc`` returns for the parsed document, or the
        first argument of the ``ValueError`` it raised.
    """
    directory, fname = os.path.split(path)
    reader = XMLCorpusReader(directory, fname)
    doc = reader.xml()
    # Only process_doc is guarded: the ``try:`` line was missing in the
    # source and the indented ``return`` marks where it belonged.
    try:
        return process_doc(doc)
    except ValueError as e:  # ``as`` form is valid on Python 2.6+ and 3
        return e.args[0]
Пример #4
def train():
    """Train a ``LangModel`` on ``data/*.xml`` and pickle it to ``lm.bin``.

    The words of every XML file are joined and split into sentences with
    ``nltk.sent_tokenize``; the sentences are passed to
    ``LangModel(3, 0.4, sentences)`` and the model is written with
    ``cPickle``.
    """
    if os.path.isfile("lm.bin"):
        # NOTE(review): the body of this guard was missing in the source;
        # skipping the training when the pickle already exists is the
        # presumed intent.
        return
    files = glob.glob("data/*.xml")
    sentences = []
    for i, path in enumerate(files):
        if i > 0 and i % 500 == 0:
            print("%d/%d files loaded, #-sentences: %d" %
                  (i, len(files), len(sentences)))
        # os.path.split copes with any path separator and avoids
        # shadowing the ``dir`` builtin.
        directory, fname = os.path.split(path)
        reader = XMLCorpusReader(directory, fname)
        sentences.extend(nltk.sent_tokenize(" ".join(reader.words())))
    lm = LangModel(3, 0.4, sentences)
    # The context manager closes the file even when pickling fails.
    with open("lm.bin", "wb") as out:
        cPickle.dump(lm, out)
def buildWordList(corpus_root):
    """Return the alphabetic non-stop-word tokens of the corpus XML files.

    Args:
        corpus_root: Directory that directly contains the ``.xml`` files.

    Returns:
        A list with every token (duplicates kept, in reading order) that
        consists of letters only and is not in NLTK's English stop-word
        list. The stop-word comparison is case-sensitive, as in the
        original filter.
    """
    stop_words = set(stopwords.words('english'))

    # The reader treats a string ``fileids`` as a regular expression; the
    # original '.xml' did not express "ends in .xml".
    xmlreader = XMLCorpusReader(corpus_root, r'.*\.xml')

    # NOTE(review): in the source ``termList`` was never filled and the
    # body of the final filter was missing. Collecting the reader's
    # tokens directly is the presumed intent; the str()/word_tokenize
    # round trip is dropped on the assumption that words() already
    # returns tokens -- confirm.
    newTerms = []
    for file in os.listdir(corpus_root):
        if not file.endswith(".xml"):
            continue
        for w in xmlreader.words(file):
            if w.isalpha() and w not in stop_words:
                newTerms.append(w)
    return newTerms

Пример #6
# Lower-case author names.
# NOTE(review): the closing "]" of this list is missing from the excerpt.
authlist = [
    'bob herbert', 'david brooks', 'nicholas d. kristof', 'thomas l. friedman',
    'paul krugman', 'maureen dowd', 'frank rich', 'verlyn klinkenborg',
    'adam cohen', 'lawrence downes'
# Path pattern below ./nyt_corpus/data/2005; not used in the visible code.
roottest = './nyt_corpus/data/2005/**/**/'

nottestmode = False

# Mapping whose missing keys start out as empty lists.
# NOTE(review): what gets stored here is not visible in this excerpt.
authord = defaultdict(list)

# Counters; only icount is updated in the visible code.
icount = 0
ncount = 0
acount = 0
# NOTE(review): `texts` is not defined in the visible code -- presumably a
# list of XML file paths; confirm.
for filename in texts:
    # NOTE(review): this call is cut off after its first argument; the file
    # id argument and the closing parenthesis are missing.
    reader = XMLCorpusReader(os.path.dirname(filename),
    xml = reader.xml()
    ptext = ""
    desk = ""
    body = xml.find('body')
    head = xml.find('head')
    # Raises AttributeError when <body> or <body.head> is absent, because
    # find() returns None in that case.
    auth = body.find('body.head').find('byline')
    # Take the "content" attribute of the <head> child named "dsk".
    for d in head:
        if d.get("name") == "dsk":
            desk = d.get("content")
    if desk == "Editorial Desk":
        icount += 1
            # NOTE(review): the indentation jumps here, so at least one line
            # (an enclosing block header) is missing above.
            if auth is not None:
                # Replace the element with its text content.
                auth = auth.text
                # NOTE(review): the body of this check is missing.
                if auth is not None:
Пример #7
# How to use the Spanish Wordnet in NLTK?
from nltk.corpus.reader import XMLCorpusReader
# NOTE(review): `dir` and `file` are never assigned in this snippet, so they
# resolve to builtins or raise NameError; the corpus root directory and the
# file id(s) presumably belong here -- confirm.
reader = XMLCorpusReader(dir, file)
Пример #8
from cStringIO import StringIO

# Get the names of all documents
texts = glob('conjuntoDatos/ingles/us*')
from nltk.corpus.reader import XMLCorpusReader

# Containers for the corpus of the whole collection
terminos = []  # List of terms
terminosUnicos = []  # Unique terms
terminosUnicos2 = []  # Unique terms
terminosProhibidos = []

# Get the vocabulary of each document
for item_path in texts:
    destino = os.path.basename(item_path)
    reader = XMLCorpusReader('conjuntoDatos/ingles', destino)
    palabras = reader.words()  # get the words
    palabrasUnicas = sorted(
        set(palabras))  # get the words without repetition
    # NOTE(review): the line(s) that open the statement continued below are
    # missing from the excerpt; nothing visible adds to `terminos` or
    # `terminosUnicos`.
        set(terminos)))  # get the words without repetition
    terminosUnicos2 = sorted(set(terminosUnicos))  # Unique terms
    reader = None  # drop the reference to the reader
    # The next two lines leave `terminos` as a fresh empty list.
    terminos = None
    terminos = []

a = "'"
# Look at the unique terms that contain an apostrophe.
# NOTE(review): the body of the `if` is missing from the excerpt.
for x in terminosUnicos2:
    if a in x:
# Get the names of all documents
texts = glob('conjuntoDatos/espanol/us*')
from nltk.corpus.reader import XMLCorpusReader

# Containers for the corpus of the whole collection
terminos = []  # List of terms
terminosUnicos = []  # Unique terms
terminosUnicos2 = []  # Unique terms
terminosProhibidos = []
palabrasTotales = 0

# Get the vocabulary of each document
for item_path in texts:
    destino = os.path.basename(item_path)
    reader = XMLCorpusReader('conjuntoDatos/espanol', destino)
    palabras = reader.words()  # get the words
    # Running total of the word count over all documents.
    palabrasTotales = palabrasTotales + len(palabras)
    palabrasUnicas = sorted(
        set(palabras))  # get the words without repetition
    # NOTE(review): the line(s) that open the statement continued below are
    # missing from the excerpt; nothing visible adds to `terminos` or
    # `terminosUnicos`.
        set(terminos)))  # get the words without repetition
    terminosUnicos2 = sorted(set(terminosUnicos))  # Unique terms
    reader = None  # drop the reference to the reader
    # The next two lines leave `terminos` as a fresh empty list.
    terminos = None
    terminos = []

a = "'"
# Look at the unique terms that contain an apostrophe.
# NOTE(review): the body of the `if` is missing from the excerpt.
for x in terminosUnicos2:
    if a in x: