import json
import os

import tfCalculator


def main():
    docs = os.listdir('papers')
    ndocs = len(docs)
    n = 0
    for i in range(ndocs):
        n += 1
        print(docs[i])
        print(n)
        f = os.path.join('papers', docs[i])
        statinfo = os.stat(f)
        # Skip files larger than ~149 KB.
        if statinfo.st_size <= 152576:
            fp = open(f, "r")
            st = fp.read()
            fp.close()
            # Renamed from `dict` to avoid shadowing the builtin.
            tf = tfCalculator.findtf(st)
            # strip('.txt') removes any of the characters '.', 't', 'x' from
            # both ends of the name; splitext drops only the extension.
            s = os.path.splitext(docs[i])[0]
            s = 'pre/' + s + '.json'
            out = open(s, "w")
            out.write(json.dumps(tf))
            out.close()
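# tfCalculator is not shown in this repository. Below is a minimal sketch of
# what findtf is assumed to do, based on how it is called above: return a dict
# mapping each term in the text to its raw occurrence count (the second script
# normalises by the maximum count, so raw counts are expected). The
# tokenisation rule here is an assumption, not the original implementation.

# tfCalculator.py (assumed module name, taken from the import above)
import re


def findtf(text):
    # Lowercase the text and split it into alphanumeric runs, then count
    # how often each term occurs.
    tf = {}
    for term in re.findall(r"[a-z0-9]+", text.lower()):
        tf[term] = tf.get(term, 0) + 1
    return tf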
import math
import os

import tfCalculator


def main():
    # Enter the path of the directory containing the sample set.
    ntf = []
    idf = {}
    docs = os.listdir("C:\\Python27\\sampleset")
    ndocs = len(docs)
    for i in range(ndocs):
        f = 'C:\\Python27\\sampleset\\' + docs[i]
        fp = open(f, "r")
        st = fp.read()
        fp.close()
        tf = tfCalculator.findtf(st)
        # Normalising term frequency by the most frequent term in the document.
        m = float(max(tf.values()))
        for term, count in tf.items():
            tf[term] = count / m
        ntf.append(tf)
    # Calculating Inverse Document Frequency idf.
    for i in range(ndocs):
        for term in ntf[i]:
            if term not in idf:
                # Count the documents containing this term, starting from the
                # current one; earlier documents cannot contain it, or it
                # would already be in idf.
                ntdocs = 1
                for k in range(i + 1, ndocs):
                    if term in ntf[k]:
                        ntdocs += 1
                # Force float division; with integer division the log of the
                # truncated ratio gives wrong idf values.
                idf[term] = math.log10(float(ndocs) / ntdocs)
    # Input file for the Bayon clustering tool: one line per document,
    # name<TAB>term<TAB>tf-idf<TAB>term<TAB>tf-idf...
    fp = open("input.txt", "w")
    for i in range(ndocs):
        fp.write(docs[i])
        fp.write('\t')
        for term in ntf[i]:
            tfidf = ntf[i][term] * idf[term]
            fp.write(term)
            fp.write('\t')
            fp.write(str(tfidf))
            fp.write('\t')
        fp.write('\n')
    fp.close()
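# A worked example of the weights written above: with ndocs = 4 and a term
# appearing in 2 documents, idf = log10(4/2) ~= 0.301, so a term whose
# normalised tf is 0.5 is written with weight 0.5 * 0.301 ~= 0.151.
#
# input.txt then matches Bayon's tab-separated input format, one document per
# line. A possible invocation (the -n flag, which sets the number of clusters,
# is an assumption about the installed Bayon version; running bayon with no
# arguments prints its usage):
#
#   bayon -n 10 input.txt > clusters.tsv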