def get_ltuo_hashtag_and_model_rank_accuracy_and_random_rank_accuracy(file):
    # 'ltuo' = list of tuples of; each item is a
    # [hashtag, model_rank_accuracy, random_rank_accuracy] triple of floats
    # read from a comma-separated file.
    ltuo_hashtag_and_model_rank_accuracy_and_random_rank_accuracy = []
    for data in FileIO.iterateLinesFromFile(file):
        # hashtag, model_rank_accuracy, random_rank_accuracy = data.split(',')[1:3]
        data = data.split(',')[2:5]
        ltuo_hashtag_and_model_rank_accuracy_and_random_rank_accuracy.append(
            [float(i) for i in [data[2], data[0], data[1]]]
        )
    return ltuo_hashtag_and_model_rank_accuracy_and_random_rank_accuracy
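# A minimal usage sketch; the CSV path below is hypothetical, not a file from
# this code base. Counts how often the model's rank accuracy beats the random
# baseline across hashtags.
def example_rank_accuracy_summary(input_file='hashtag_rank_accuracies.csv'):
    ltuo = get_ltuo_hashtag_and_model_rank_accuracy_and_random_rank_accuracy(input_file)
    wins = sum(1 for _, model_accuracy, random_accuracy in ltuo if model_accuracy > random_accuracy)
    print '%s/%s hashtags where the model beats the random baseline' % (wins, len(ltuo))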
def iterateFrequentLocationsFromFIMahout(
        minLocationsTheUserHasCheckedin,
        minUniqueUsersCheckedInTheLocation,
        minCalculatedSupport,
        minLocationsInItemset=0,
        extraMinSupport=minSupport,
        yieldSupport=False,
        lids=False,
):
    # Parses Mahout FP-Growth output. Judging by the parsing below, each
    # itemset line appears to look like
    # "Key: ... Value: ([<lid> <lid> ...],<support>)", where lids are location
    # ids with '_' in place of ' '.
    # for line in FileIO.iterateLinesFromFile(locationsFIMahoutOutputFile%(minUserLocations, minCalculatedSupport)):
    for line in FileIO.iterateLinesFromFile(
            locationsFIMahoutOutputFile % (
                minLocationsTheUserHasCheckedin,
                minUniqueUsersCheckedInTheLocation,
                minCalculatedSupport,
            )):
        if line.startswith('Key:'):
            data = line.split('Value: ')[1][1:-1].split(',')
            if not lids:
                locationItemset = [getLocationFromLid(i.replace('_', ' ')) for i in data[0][1:-1].split()]
            else:
                locationItemset = [i.replace('_', ' ') for i in data[0][1:-1].split()]
            support = int(data[1])
            if support >= extraMinSupport and len(locationItemset) >= minLocationsInItemset:
                if not yieldSupport:
                    # Keep only locations inside the US bounding box.
                    yield [location for location in locationItemset if isWithinBoundingBox(location, us_boundary)]
                else:
                    yield [
                        location for location in locationItemset
                        if isWithinBoundingBox(getLocationFromLid(location), us_boundary)
                    ], support
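# A minimal sketch of consuming the generator above with itemset supports; the
# threshold values are illustrative assumptions, not tuned settings from this
# project.
def example_print_frequent_location_itemsets():
    for locationItemset, support in iterateFrequentLocationsFromFIMahout(
            minLocationsTheUserHasCheckedin=5,
            minUniqueUsersCheckedInTheLocation=5,
            minCalculatedSupport=50,
            minLocationsInItemset=2,
            yieldSupport=True,
            lids=True):
        print support, locationItemset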
def streamingLSHClusteringDemo():
    clustering_settings = {
        'dimensions': 53,
        'signature_length': 13,
        'number_of_permutations': 5,
        'threshold_for_document_to_be_in_cluster': 0.2,
    }
    clustering = StreamingLSHClustering(**clustering_settings)
    docId = 0
    docsToOriginalClusterMap = {}
    for line in FileIO.iterateLinesFromFile('../data/streaming.dat'):
        document = createDocumentFromLine(docId, line)
        docsToOriginalClusterMap[docId] = document.clusterId
        docId += 1
        clustering.getClusterAndUpdateExistingClusters(document)
    clusterLabels = []
    for k, cluster in clustering.clusters.iteritems():
        clusterLabels.append([docsToOriginalClusterMap[doc.docId] for doc in cluster.iterateDocumentsInCluster()])
    return EvaluationMetrics.getValueForClusters(clusterLabels, EvaluationMetrics.purity)
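# A minimal sketch of running the demo above. Each line of ../data/streaming.dat
# is expected to start with the document's true cluster label followed by its
# words (see createDocumentFromLine); purity close to 1.0 means the LSH clusters
# recover the original labels well.
def example_run_streaming_demo():
    print 'streaming LSH clustering purity: %0.3f' % streamingLSHClusteringDemo()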
nns_settings = {
    'dimensions': 53,
    'signature_length': 13,
    'number_of_permutations': 5,
    'signature_type': 'signature_type_lists',
    'nearest_neighbor_threshold': 0.2,
}

def createDocumentFromLine(docId, line):
    vector, words = Vector(), line.split()
    for word in words[1:]:
        if word not in vector:
            vector[word] = 1
        else:
            vector[word] += 1
    return Document(words[0], vector)

i = 0
documents = []
for line in FileIO.iterateLinesFromFile('../data/streaming.dat'):
    documents.append(createDocumentFromLine(None, line))
    i += 1
    if i == 10:
        break

class NearestNeighborUsingLSHTests(unittest.TestCase):
    def setUp(self):
        self.nnsLSH = NearestNeighborUsingLSH(**nns_settings)

    # def test_nns(self):
    #     for d in documents:
    #         self.nnsLSH.update(d)
    #         self.assertEqual(d.docId, self.nnsLSH.getNearestDocument(d))

    def test_getNearestDocumentWithReplacement(self):
        for d in documents:
            self.nnsLSH.update(d)
        for d in documents:
            print d.docId, self.nnsLSH.getNearestDocumentWithReplacement(d)
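# Standard unittest entry point so the test module can be run directly
# (python <this_test_file>.py); running it through a test runner works too.
if __name__ == '__main__':
    unittest.main()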
def offlineLSHClusteringDemo():
    wordToDimensionMap = {}

    def createDocumentFromLine(docId, line):
        vector = Vector()
        words = line.split()
        for word in words[1:]:
            if word not in wordToDimensionMap:
                wordToDimensionMap[word] = len(wordToDimensionMap)
            wordDimension = wordToDimensionMap[word]
            if wordDimension not in vector:
                vector[wordDimension] = 1
            else:
                vector[wordDimension] += 1
        return Document(docId, vector, clusterId=words[0])

    dimensions = 53
    signatureLength = 13
    numberOfPermutations = 5

    unitVector = RandomGaussianUnitVector(dimensions=dimensions, mu=0, sigma=1)
    vectorPermutations = VectorPermutation.getPermutations(signatureLength, dimensions, unitVector)
    signaturePermutations = [SignaturePermutationWithTrie(signatureLength) for i in range(numberOfPermutations)]
    permutatedUnitVectors = [unitVector.getPermutedVector(r) for r in vectorPermutations]

    # Build the LSH model.
    # Read training documents.
    trainingDocumentsMap = {}
    for docId, l in enumerate(FileIO.iterateLinesFromFile('../data/train_offline.dat')):
        trainingDocumentsMap[docId] = createDocumentFromLine(docId, l)
    # Construct cluster vectors (the mean vector of each cluster's documents).
    clusterToDocumentsMap = defaultdict(list)
    for document in trainingDocumentsMap.values():
        clusterToDocumentsMap[document.clusterId].append(document)
    clusterMap = {}
    for k, v in clusterToDocumentsMap.iteritems():
        clusterMap[k] = Document(docId=k, vector=Vector.getMeanVector(v), clusterId=k)
    # Create signatures for all the clusters and index them in every
    # signature permutation.
    for document in clusterMap.values():
        document.setSignatureUsingVectors(permutatedUnitVectors)
    for permutation in signaturePermutations:
        for document in clusterMap.values():
            permutation.addDocument(document)

    # Test the model.
    # Read testing documents.
    testDocumentsMap = {}
    for docId, l in enumerate(FileIO.iterateLinesFromFile('../data/test_offline.dat')):
        testDocumentsMap[docId] = createDocumentFromLine(docId, l)
    # Create signatures for the test documents.
    for document in testDocumentsMap.values():
        document.setSignatureUsingVectors(permutatedUnitVectors)

    predicted, labels = [], []
    for t in testDocumentsMap.values():
        # Candidate clusters are the union of near neighbors found by each
        # signature permutation; the final label is the candidate with the
        # highest cosine similarity.
        possibleNearestClusters = reduce(
            lambda x, y: x.union(y),
            (permutation.getNearestDocuments(t) for permutation in signaturePermutations),
            set())
        predictedClass = max(
            ((clusterId, clusterMap[clusterId].cosineSimilarity(t)) for clusterId in possibleNearestClusters),
            key=itemgetter(1))
        predicted.append(predictedClass[0])
        labels.append(t.clusterId)
    return EvaluationMetrics.purity(predicted, labels)
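# For intuition, a self-contained sketch (plain Python, none of the project's
# classes) of the random-hyperplane signatures the demo builds: vectors pointing
# in similar directions fall on the same side of most random hyperplanes, so
# their bit signatures collide often, which is what lets the trie lookups above
# find candidate near neighbors cheaply. All names here are illustrative.
import random

def make_hyperplanes(dimensions, signature_length):
    # Each hyperplane is a vector of independent Gaussian weights.
    return [dict((d, random.gauss(0, 1)) for d in range(dimensions))
            for _ in range(signature_length)]

def random_hyperplane_signature(vector, hyperplanes):
    # One bit per hyperplane: which side of the hyperplane the vector lies on
    # (sign of the dot product). 'vector' is a sparse dict of dimension -> weight.
    return tuple(1 if sum(vector.get(d, 0.0) * w for d, w in h.iteritems()) >= 0 else 0
                 for h in hyperplanes)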