def quantize(abstractions):
    print 'calculating random trigrams...'
    # Build nested trigram counts for every abstraction.
    # (c, i, u, l are progress bookkeeping passed to feedbackMessage().)
    allGrams = {}
    c, i, u, l = 1, 20, 0, len(abstractions)
    for abstraction in abstractions:
        u = feedbackMessage(c, i, u, l, 'generating n-grams:')
        allGrams = trigrammatize(abstraction, allGrams)
        c += 1
    # Flatten the nested dict into (score, 'w1_w2_w3') pairs and tally how
    # many trigrams occur at each score, so the tipping point can be found.
    ranks = []
    levels = {}
    cc, i, u, l = 1, 20, 0, len(allGrams)
    for gram1 in allGrams.keys():
        u = feedbackMessage(cc, i, u, l, 'calculating n-gram frequencies:')
        for gram2 in allGrams[gram1].keys():
            for gram3 in allGrams[gram1][gram2].keys():
                score = allGrams[gram1][gram2][gram3]
                levels[score] = levels.get(score, 0) + 1
                ranks.append((score, gram1 + '_' + gram2 + '_' + gram3))
                # Free each entry as soon as it has been flattened.
                del allGrams[gram1][gram2][gram3]
        cc += 1
    print 'done. calculating tipping point...'
    tippingPoint = findConfluenceDynamic(levels)
    print 'done.\nfiltering data.\nextracting best feature sets...'
    # Keep only the feature sets at or above the tipping point, best first.
    ranks = [rank for rank in ranks if rank[0] >= tippingPoint]
    ranks.sort()
    ranks.reverse()
    print 'done.\nclustering...'
    return ranks
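# Hedged usage sketch for quantize(). The exact shape of `abstractions` is
# dictated by trigrammatize(), defined elsewhere in this module; plain token
# lists are assumed here purely for illustration:
#
#   abstractions = [['the', 'cat', 'sat'], ['the', 'dog', 'sat']]
#   ranks = quantize(abstractions)
#   # ranks is a list of (score, 'word1_word2_word3') pairs, filtered to
#   # scores at or above the tipping point and sorted best first.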
def allWords(lines):
    # Tokenize every line and pool the tokens into one flat word list.
    words = []
    c, i, u, l = 1, 20, 0, len(lines)
    for text in lines:
        u = feedbackMessage(c, i, u, l, 'loading file:')
        words += tokenize(text)
        c += 1
    return words
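# Minimal sketch, assuming tokenize() splits a line into word tokens
# ('data/corpus.txt' is a placeholder path):
#
#   words = allWords(open('data/corpus.txt').readlines())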
def clusterize(freqDist, originalAbstractions, originals,
               docsByLine, lines, returnIndividualSentences):
    trigrams = [pair[1] for pair in freqDist]
    clusters = startClustering(trigrams)
    # abstractions = list of sentence vectors without noise
    abstractions = [set(abstraction) for abstraction in originalAbstractions]
    territory = [originals[z] for z in range(len(abstractions))]
    indexes = range(len(abstractions))
    print len(abstractions), 'initial vectors,', len(trigrams), \
        'initial feature sets'
    c, i, u, l = 1, 20, 0, len(trigrams)
    while abstractions and trigrams:
        c += 1
        u = feedbackMessage(c, i, u, l, 'clustering: ' +
                            str(len(abstractions)) + ' remaining sentence vectors, ' +
                            str(len(trigrams)) + ' remaining feature sets -')
        trigram = trigrams[0]
        features = set(trigram.split('_'))
        # The commented-out lines below belong to a disabled variant in which
        # each sentence is removed from the pool after its first match,
        # instead of joining every cluster whose features it contains.
        # unclassifiedAbstractions = []
        # unclassifiedOriginals = []
        # unclassifiedIndexes = []
        for j in range(len(abstractions)):
            abstraction = abstractions[j]
            # print features, '\t', abstraction, '\t', features.intersection(abstraction)
            # A sentence joins the cluster when it contains all three of the
            # trigram's features.
            if features.issubset(abstraction):
                where = clusters[trigram]
                if returnIndividualSentences:
                    where.documents.append(territory[j])
                    where.indexes.append(indexes[j])
                else:
                    # Store a 100-character snippet of the source document,
                    # starting just past the first ', ' delimiter.
                    original = lines[docsByLine[indexes[j]]]
                    original = original[original.index(', ') + 2:
                                        original.index(', ') + 102]
                    where.documents.append(original)
                    where.indexes.append(docsByLine[indexes[j]])
                where.vectors.append(abstraction)
                where.space += list(abstraction)
            # else:
            #     unclassifiedAbstractions.append(abstraction)
            #     unclassifiedOriginals.append(territory[j])
            #     unclassifiedIndexes.append(indexes[j])
        # abstractions = unclassifiedAbstractions
        # territory = unclassifiedOriginals
        # indexes = unclassifiedIndexes
        trigrams = trigrams[1:]
    if not returnIndividualSentences:
        # Deduplicate: a document can match several trigrams.
        for key in clusters.keys():
            clusters[key].documents = sorted(set(clusters[key].documents))
            clusters[key].indexes = sorted(set(clusters[key].indexes))
    return clusters
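# Hedged usage sketch. startClustering() is defined elsewhere; judging from
# the attribute accesses above, each cluster exposes .documents, .indexes,
# .vectors, and .space. The argument wiring is an assumption based on the
# parameter names, not a confirmed call chain:
#
#   ranks = quantize(abstractions)
#   clusters = clusterize(ranks, abstractions, originals, docsByLine,
#                         lines, False)
#   for trigram in clusters:
#       print trigram, len(clusters[trigram].documents), 'documents'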
def advancedRead(path, relevant):
    rd = open(path, 'r')
    lines = rd.readlines()
    rd.close()
    newLines = []
    entities = {}
    c, i, u, l = 1, 20, 0, len(lines)
    for line in lines:
        u = feedbackMessage(c, i, u, l, 'segmenting file:')
        # Entity recognition is currently disabled, so every line passes
        # through unchanged and entities stays empty.
        # recognition = entityRecognition(line.strip(), entities, noise)
        # newLine = recognition[0]
        # entities = recognition[1]
        newLines.append(line)
        # newLines.append(newLine)
        c += 1
    return [newLines, entities]
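# Minimal sketch ('data/corpus.txt' is a placeholder path; `relevant` goes
# unused while entityRecognition() is disabled, so None is passed here):
#
#   newLines, entities = advancedRead('data/corpus.txt', None)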