Example #1
File: tp2.py  Project: MrBoas/PLC
import getopt
import os
import sys
from pickle import dump, load

import nltk
from nltk.corpus.reader import TaggedCorpusReader

# corpus_path and the helpers processLine, sortTriplos, remTriplosLastN,
# get_nodes and draw are defined elsewhere in the project.


def main():
    ops, args = getopt.getopt(sys.argv[1:], 'b')
    ops = dict(ops)

    if '-b' in ops:
        # Train a trigram tagger that backs off to bigram, unigram
        # and finally a default 'N' tagger.
        corpus = TaggedCorpusReader('tagged/', r'.*\.tagged')
        tagged_sents_m = corpus.tagged_sents()
        m0 = nltk.DefaultTagger('N')
        m1 = nltk.UnigramTagger(tagged_sents_m, backoff=m0)
        m2 = nltk.BigramTagger(tagged_sents_m, backoff=m1)
        m3 = nltk.TrigramTagger(tagged_sents_m, backoff=m2)

        # Create the output directory and pickle the trained tagger.
        os.makedirs(os.path.dirname(corpus_path), exist_ok=True)
        with open(corpus_path, 'wb') as output_file:
            dump(m3, output_file, -1)
    else:
        # Load the pickled tagger.
        with open(corpus_path, 'rb') as corpus_input:
            tagger_corpus = load(corpus_input)
        # Load the input file.
        file_path = sys.argv[1]
        with open(file_path, 'r') as file_input:
            file_lines = file_input.readlines()
        print("### LOAD DONE ###")  # debug

        triplos = []
        for line in file_lines:
            if line != '\n':  # process non-empty lines
                triplos = processLine(line, tagger_corpus, triplos)
        triplos.sort(key=sortTriplos)
        # triplos = remTriplosLastN(3, triplos)
        print(triplos)

        nodes = get_nodes(triplos)
        edgesW = triplos
        draw(nodes, edgesW)
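
The `-b` branch above trains the backoff chain and pickles it with `dump`; the other branch reuses the pickled model. A minimal sketch of that reuse, with the hypothetical 'model.pkl' standing in for the project's corpus_path:

from pickle import load

# 'model.pkl' is a stand-in for corpus_path (hypothetical file name).
with open('model.pkl', 'rb') as f:
    tagger = load(f)

# Lookups fall back trigram -> bigram -> unigram -> default 'N'.
print(tagger.tag(['o', 'gato', 'dorme']))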
Example #2
# Fragment from a per-city loop: currentCity, totalCities, city, files,
# corpus_path, trans and the helper functions come from the enclosing scope.
currentProgress(currentCity, totalCities, 'cities')
totalReviews = 0

for file in files:
    corpusdir = corpus_path + file[0:-5] + '/'
    if not os.path.isdir(corpusdir):
        missingCorpus(corpusdir)
    hotelcorpus = TaggedCorpusReader(corpusdir, '.*')

    # Project stopwords plus the corpus-specific stopset file.
    stopset = getDefaultStopset(set(hotelcorpus.words("stopset.txt")))

    for review in hotelcorpus.fileids():
        if review == "stopset.txt":
            continue
        content = hotelcorpus.tagged_sents(review)
        if len(content) == 0:
            continue
        totalReviews += 1
        trimmedTokens = []
        for sentence in content:
            # Keep lower-cased alphabetic tokens that are not stopwords.
            for word, pos in sentence:
                if word.isalpha() and word.lower() not in stopset:
                    trimmedTokens += [(word.lower(), pos)]
            trans += [lemmatize.getLemmas(trimmedTokens)]
            trimmedTokens = []

lemmatize.saveLemmaDict()
freqDict = cityFreq(city, corpus_path, files)

TotalHotels = freqDict['TotalHotels']
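
Each hotel directory above becomes its own TaggedCorpusReader. A self-contained sketch of how that reader splits word/tag text, using a throwaway corpus in a temp directory (file name and contents are made up):

import os
import tempfile

from nltk.corpus.reader.tagged import TaggedCorpusReader

corpusdir = tempfile.mkdtemp()
with open(os.path.join(corpusdir, 'review1.txt'), 'w') as f:
    f.write('The/DT room/NN was/VBD spotless/JJ ./.\n')

reader = TaggedCorpusReader(corpusdir, r'.*\.txt')
print(reader.words())         # ['The', 'room', 'was', 'spotless', '.']
print(reader.tagged_sents())  # [[('The', 'DT'), ('room', 'NN'), ...]]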
Example #3
import nltk
from nltk.corpus import ConllChunkCorpusReader
from nltk.corpus.reader.tagged import TaggedCorpusReader

# Word/tag pairs in this corpus are separated by '_' (e.g. "dog_NN").
root = '/usr/local/share/nltk_data/corpora/MASC-for-NE/'
masc_for_ne = TaggedCorpusReader(root, '.*', '_')

# Run NLTK's named-entity chunker over every tagged sentence.
sents = masc_for_ne.tagged_sents()
ne_sents = [nltk.ne_chunk(sent) for sent in sents]

# Gold-standard chunks in CoNLL format, for comparison.
root = "/usr/local/share/nltk_data/corpora/masc_conll/"
gold_corpus = ConllChunkCorpusReader(
    root, r".*\.conll",
    chunk_types=("DATE", "PERSON", "ORGANIZATION", "LOCATION"))
gold_sents = gold_corpus.chunked_sents()
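
With ne_sents and gold_sents built, a natural next step is scoring the chunker against the gold trees. A hedged sketch using NLTK's ChunkScore; it assumes both corpora yield identically tokenized sentences, which may not hold for real MASC data:

from nltk.chunk.util import ChunkScore

score = ChunkScore()
for gold_tree, guess_tree in zip(gold_sents, ne_sents):
    score.score(gold_tree, guess_tree)  # correct tree first, then the guess

print(score)  # reports IOB accuracy, precision, recall and F-measure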