def _find_or_add_word(hugewordlist, index_of, token, sentence_id):
    """Return the position of the Word for *token* in *hugewordlist*.

    The Word is appended first when no equal Word is stored yet.

    hugewordlist -- list of Word instances; only ever grows by appending, so
                    a position handed out once stays valid.
    index_of     -- dict mapping token strings to positions already resolved;
                    updated in place so each distinct token costs at most one
                    linear scan of the list.
    token        -- the raw word string.
    sentence_id  -- id recorded on a newly created Word before it is stored.
    """
    index = index_of.get(token)
    if index is None:
        candidate = Word(token)
        try:
            # Relies on Word equality, exactly like the lookup this replaces.
            index = hugewordlist.index(candidate)
        except ValueError:
            # Genuinely new word: register the sentence on it, then store it.
            candidate.insertsentenceid(sentence_id)
            hugewordlist.append(candidate)
            index = len(hugewordlist) - 1
        index_of[token] = index
    return index


def buildcorpus(corpus, rootpath, filelimit = 0):
    """Index every file in *rootpath* into *corpus* and compute tf*idf.

    Each directory entry is read with texter.readnewstext(), split into
    sentences with texter.splitToSentences(), and tokenised on whitespace.
    Every sentence is stored as a Sentence holding positions into the word
    list; every document is stored as a Text. Afterwards each word gets its
    per-document term frequency, its inverse document frequency and its
    tf*idf value, the word list is handed to the corpus, and the corpus is
    asked to pickle it.

    corpus    -- object providing `words`, set_corpusname(), inserttext(),
                 assignwords() and pickledumpwords(). Its existing words are
                 kept at the front of the new word list.
    rootpath  -- directory whose entries are the documents to read.
    filelimit -- stop after this many directory entries. The default 0 never
                 matches the running count, so all entries are processed.

    Returns None. Progress messages are printed to standard output.
    """
    fileids = os.listdir(rootpath)

    # Word instances of the whole corpus, seeded with what is already there.
    hugewordlist = list(corpus.words)
    # token string -> position in hugewordlist, filled lazily.
    index_of = {}
    numoffiles = 0

    # NOTE(review): with a filelimit below len(fileids) the name still carries
    # the directory size, not the number of files processed -- confirm intent.
    corpus.set_corpusname(str(max(filelimit, len(fileids))) + "texts")

    for fileid in fileids:
        allwords = nltk.FreqDist()   # word counts of this document only
        doc_id = fileid.split(".")[0]
        newtext = Text(doc_id)

        rawtext = texter.readnewstext(os.path.join(rootpath, fileid))

        # Each item returned by splitToSentences is treated as one sentence.
        for sntindex, line in enumerate(texter.splitToSentences(rawtext)):
            sentence_id = doc_id + "_" + str(sntindex)
            words = texter.eliminatepunctuation(line.split())
            words = [word for word in words if not word.isspace()]

            # Open question carried over from the original author: should a
            # sentence hold Word objects or word positions? It holds positions.
            sentence = Sentence(sntindex)
            for word in words:
                allwords.inc(word)
                index = _find_or_add_word(hugewordlist, index_of, word,
                                          sentence_id)
                hugewordlist[index].insertsentenceid(sentence_id)
                sentence.insertword(index)

            newtext.insertsentence(sentence)

        if (not rawtext.isspace()) or allwords:
            corpus.inserttext(newtext)

            print(" ".join((str(numoffiles),
                            " : finished handling the words-snts-txts ",
                            doc_id)))

            numofwords = sum(allwords.values())
            # An empty (or punctuation-only) document is not whitespace-only,
            # so it reaches this point with no words at all. There is nothing
            # to count for it, and a zero total would presumably break the
            # frequency computation -- skip it.
            if numofwords:
                for word in hugewordlist:
                    word.assigntermfreq(allwords[word.literal], numofwords,
                                        doc_id)

        numoffiles = numoffiles + 1
        if filelimit == numoffiles:
            break

    # NOTE(review): this is the directory size even when filelimit ended the
    # loop early -- confirm that is the document count idf should use.
    numofdocs = len(fileids)
    print("computing tf*idf")
    for word in hugewordlist:
        word.computeinvdocfreq(numofdocs)
        word.computeTFIDF()

    corpus.assignwords(hugewordlist)
    print(" ".join(("corpus length ", str(len(corpus.words)), " words")))
    print(" ".join(("huges length ", str(len(hugewordlist)), " words")))
    print("exiting buildcorpus()")

    print("pickle-dumping words")
    corpus.pickledumpwords()