import glob

import nltk
from nltk.corpus.reader import XMLCorpusReader
from nltk.probability import LidstoneProbDist
from nltk.model import NgramModel  # NgramModel ships with NLTK 2.x (removed in NLTK 3)


def train():
    # parse the XML files and load up words
    print("Loading words from XML files...")
    sentences = []
    files = glob.glob("data/*.xml")
    i = 0
    for file in files:
        if i > 0 and i % 500 == 0:
            print("%d/%d files loaded, #-sentences: %d" %
                  (i, len(files), len(sentences)))
            break  # stop after the first 500 files
        dir, file = file.split("/")
        reader = XMLCorpusReader(dir, file)
        sentences.extend(nltk.sent_tokenize(" ".join(reader.words())))
        i += 1
    # tokenize each sentence into its words
    words = []
    for sentence in sentences:
        words.append(nltk.word_tokenize(sentence))
    # build a trigram language model with Lidstone smoothing
    # over the tokenized sentences
    print("Building language model...")
    est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    langModel = NgramModel(3, words, estimator=est)
    #  langModel = NgramModel(3, words)
    #  cPickle.dump(langModel, open("lm.bin", 'wb'))
    return langModel
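A minimal usage sketch, not from the original project: it assumes the NLTK 2.x
NgramModel API, where prob(word, context) returns the conditional probability of
a word given the preceding words.

lm = train()
# probability of "model" following the two preceding words ("a", "language")
print(lm.prob("model", ["a", "language"]))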
Example #3
def doc_path_to_dict(path):
    directory, fname = os.path.split(path)
    reader = XMLCorpusReader(directory, fname)
    doc = reader.xml()
    try:
        # process_doc is a helper defined elsewhere in the project
        return process_doc(doc)
    except ValueError as e:
        # on failure, return the error message instead of a dict
        return e.args[0]
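A hypothetical call, with "data/sample.xml" as a placeholder path; the result is
either the dict built by process_doc or the ValueError message it raised.

result = doc_path_to_dict("data/sample.xml")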
Example #4
File: framenet.py  Project: Tahnan/nltk
    def __init__(self, root, fileids):
        XMLCorpusReader.__init__(self, root, fileids)

        # framenet corpus sub dirs
        # sub dir containing the xml files for frames
        self._frame_dir = "frame"
        # sub dir containing the xml files for lexical units
        self._lu_dir = "lu"
        # sub dir containing the xml files for fulltext annotation files
        self._fulltext_dir = "fulltext"

        # Indexes used for faster look-ups
        self._frame_idx = None
        self._lu_idx = None
        self._fulltext_idx = None
        self._semtypes = None
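A short usage sketch, assuming this reader is the one exposed as
nltk.corpus.framenet in current NLTK releases; frames() searches the frame XML
files indexed by the constructor above.

from nltk.corpus import framenet as fn
# list all frames whose name matches the pattern (case-insensitive)
print(fn.frames(r'(?i)medical'))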
Example #5
import glob
import os
import cPickle

import nltk
from nltk.corpus.reader import XMLCorpusReader


def train():
    # skip training if a pickled model already exists
    if os.path.isfile("lm.bin"):
        return
    files = glob.glob("data/*.xml")
    sentences = []
    i = 0
    for file in files:
        if i > 0 and i % 500 == 0:
            print("%d/%d files loaded, #-sentences: %d" %
                  (i, len(files), len(sentences)))
        dir, file = file.split("/")
        reader = XMLCorpusReader(dir, file)
        sentences.extend(nltk.sent_tokenize(" ".join(reader.words())))
        i += 1
    # LangModel is a project-local class wrapping a smoothed trigram model
    lm = LangModel(3, 0.4, sentences)
    cPickle.dump(lm, open("lm.bin", "wb"))
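The counterpart sketch for loading the model back, assuming train() has already
written lm.bin:

lm = cPickle.load(open("lm.bin", "rb"))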
Example #8
File: loadwords.py  Project: antfriend/NLP
def Get_text(corpus_root='/release/', file_IDs='.*'):
    #wordlists = PlaintextCorpusReader(corpus_root, '.*')
    wordlists = XMLCorpusReader(corpus_root, file_IDs)
    print("processing " + corpus_root + file_IDs)
    raw = wordlists.raw()
    #print("corpus rawed ...")
    tokens = nltk.word_tokenize(raw)
    #print("corpus tokenized ...")
    text = nltk.Text(tokens)
    #print("corpus textified ...")

    simple_md = [word.lower() for word in text if word.isalpha()]
    #print("corpus lowercased and alphafied ...")
    simple_md = [word for word in simple_md if word != 'source']
    #print("keyword *source* removed ...")
    print("DONE!")
    return simple_md
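A hypothetical call, with "corpus/" as a placeholder directory; the regex limits
the reader to XML files.

words = Get_text(corpus_root='corpus/', file_IDs=r'.*\.xml')
print(len(words))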
Example #9

def buildWordList(corpus_root):
    stop_words = set(stopwords.words('english'))

    fileids = '.xml'
    xmlreader = XMLCorpusReader(corpus_root, fileids)
    #print(xmlreader)
    termList = []
    for file in os.listdir(corpus_root):
        if file.endswith(".xml"):
            terms = Text(xmlreader.words(file))
            termList.append(terms)

    #print(termList)
    terms = word_tokenize(str(termList))
    # keep only alphabetic tokens that are not stopwords
    newTerms = []
    for w in terms:
        if w not in stop_words and w.isalpha():
            newTerms.append(w)
    #print(newTerms)
    return newTerms


#print(buildWordList(corpus_root))
Example #10
    def read_articles(loc_corpus, article_length):

        texts_online = glob(loc_corpus + '/*/*')
        texts_magazine = glob(loc_corpus + '/*/*')
        years_online = []
        years_magazine = []
        for text in texts_online:
            if text.endswith('.xml'):
                years_online += glob(text)
        for text in texts_magazine:
            if text.endswith('.xml'):
                years_magazine += glob(text)

        reader_online = XMLCorpusReader(loc_corpus, years_online)
        reader_magazine = XMLCorpusReader(loc_corpus, years_magazine)
        fileid_list = []
        for fileid in reader_magazine.fileids():
            fileid_list.append(fileid)
        for fileid in reader_online.fileids():
            fileid_list.append(fileid)

        articles = []

        for fileid in fileid_list:
            # recover=True requires lxml.etree imported as ET
            parser = ET.XMLParser(recover=True)
            tree = ET.parse(fileid, parser=parser)
            for elem in tree.iter(tag='artikel'):
                add_article = Article()
                for metadaten in elem.iter(tag='metadaten'):
                    for id in metadaten.iter(tag='artikel-id'):
                        add_article.id = id.text
                for metadaten in elem.iter(tag='inhalt'):
                    if metadaten.tag is None:
                        break
                    for child in metadaten.iter(tag='text'):
                        if child.tag is None:
                            break
                        for titel_liste in child.iter(tag='titel-liste'):
                            for title in titel_liste.iter(tag='titel'):
                                add_article.title = title.text
                        article_text = ""
                        for text in child.iter(tag='absatz'):
                            if text.text is None:
                                break
                            article_text += text.text
                        if len(article_text) > article_length:
                            add_article.content = article_text
                articles.append(add_article)
       
        return articles
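A hypothetical call (treating read_articles as a plain function), with "corpus"
as a placeholder root; only articles whose text exceeds 500 characters get
their content field filled.

articles = read_articles("corpus", 500)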
Example #11
authlist = [
    'bob herbert', 'david brooks', 'nicholas d. kristof', 'thomas l. friedman',
    'paul krugman', 'maureen dowd', 'frank rich', 'verlyn klinkenborg',
    'adam cohen', 'lawrence downes'
]
roottest = './nyt_corpus/data/2005/**/**/'

nottestmode = False

authord = defaultdict(list)

icount = 0
ncount = 0
acount = 0
for filename in texts:
    reader = XMLCorpusReader(os.path.dirname(filename),
                             os.path.basename(filename))
    xml = reader.xml()
    ptext = ""
    desk = ""
    body = xml.find('body')
    head = xml.find('head')
    auth = body.find('body.head').find('byline')
    for d in head:
        if d.get("name") == "dsk":
            desk = d.get("content")
    if desk == "Editorial Desk":
        icount += 1
        try:
            if auth is not None:
                auth = auth.text
                if auth is not None:
Example #12
# How to use the Spanish Wordnet in NLTK?
from nltk.corpus.reader import XMLCorpusReader
# dir is the directory holding the wordnet XML files, file the fileid to read
reader = XMLCorpusReader(dir, file)
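A concrete sketch with placeholder values; "wordnet_es/" and "eswn.xml" are
hypothetical names for the Spanish Wordnet directory and XML file.

reader = XMLCorpusReader("wordnet_es/", "eswn.xml")
print(reader.words("eswn.xml")[:10])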
Example #13
from cStringIO import StringIO

# get all the document file names
texts = glob('conjuntoDatos/ingles/us*')
from nltk.corpus.reader import XMLCorpusReader

# declare the corpus for the whole collection
terminos = []  # list of terms
terminosUnicos = []  # unique terms
terminosUnicos2 = []  # unique terms
terminosProhibidos = []  # forbidden terms

# collect the vocabulary of each document
for item_path in texts:
    destino = os.path.basename(item_path)
    reader = XMLCorpusReader('conjuntoDatos/ingles', destino)
    palabras = reader.words()  # get the words
    palabrasUnicas = sorted(set(palabras))  # drop duplicate words
    terminos.extend(palabrasUnicas)
    terminosUnicos.extend(sorted(set(terminos)))  # accumulate the unique words
    terminosUnicos2 = sorted(set(terminosUnicos))  # unique terms
    reader = None  # release the reader
    terminos = None
    terminos = []

# collect terms containing an apostrophe
a = "'"
for x in terminosUnicos2:
    if a in x:
        terminosProhibidos.append(x)
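A follow-up sketch grounded in the variables above (the vocabulario name is new
here): drop the apostrophe-bearing tokens from the final vocabulary.

vocabulario = [t for t in terminosUnicos2 if t not in terminosProhibidos]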
Example #14
    def words(self, fileids=None, categories=None):
        words = []
        fileids = self._resolve(fileids, categories)
        for fileid in fileids:
            words += XMLCorpusReader.words(self, fileid)
        return words
Example #15
    def raw(self, fileids=None, categories=None):
        return XMLCorpusReader.raw(self, self._resolve(fileids, categories))
Example #16
    def __init__(self, *args, **kwargs):
        MyCategorizedCorpusReader.__init__(self, kwargs)
        XMLCorpusReader.__init__(self, *args, **kwargs)
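A minimal sketch tying examples #14-#16 together; the class name and cat_pattern
value are assumptions, and MyCategorizedCorpusReader is taken to behave like
NLTK's CategorizedCorpusReader (consuming cat_pattern and providing _resolve()).

class MyCategorizedXMLCorpusReader(MyCategorizedCorpusReader, XMLCorpusReader):

    def __init__(self, *args, **kwargs):
        MyCategorizedCorpusReader.__init__(self, kwargs)
        XMLCorpusReader.__init__(self, *args, **kwargs)

    # the words() and raw() overrides from examples #14 and #15 go here

# hypothetical instantiation: categories are derived from the top-level
# directory of each fileid
reader = MyCategorizedXMLCorpusReader("corpus/", r".*\.xml",
                                      cat_pattern=r"(\w+)/.*")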