# Example no. 1
def extract():
    """Extract features from the app-description CSV files under INPUT_PATH.

    For every non-hidden directory in INPUT_PATH, a directory of the same
    name is created under OUTPUT_PATH. Each file in the input directory is
    read as CSV, and an output file of the same name is opened for writing
    through a CSV writer.

    NOTE(review): several lines of this function appear to be missing (see
    the inline notes below); as shown it is not syntactically valid.
    """
    print 'Extracting features from app descriptions...\n'
    # NOTE(review): the body of this `if` is missing -- presumably it deals
    # with an already existing OUTPUT_PATH before it is repopulated; confirm.
    if os.path.exists(OUTPUT_PATH):

    for dir in os.listdir(INPUT_PATH):
        # Skip hidden entries (names starting with a dot).
        if not dir.startswith('.'):
            # Mirror the input directory name under OUTPUT_PATH.
            os.makedirs("{}/{}".format(OUTPUT_PATH, dir))
            for file in os.listdir('{}/'.format(INPUT_PATH) + dir):
                with open('{}/{}/{}'.format(INPUT_PATH, dir, file), 'rb') as f:
                    reader = csv.reader(f)
                    with open('{}/{}/{}'.format(OUTPUT_PATH, dir, file),
                              'wb') as r:
                        writer = csv.writer(r)
                        for app in reader:
                            # Column 0 is used as the app name and column 2
                            # as its description.
                            name = app[0]
                            description = app[2]

                            # Prepare an app description string for NLTK and LDA processing
                            # NOTE(review): the argument list and closing
                            # parenthesis of this call are missing.
                            preparedDescription = prepare_description(

                            # Extract 3-word featurelets from the description
                            # NOTE(review): the argument list and closing
                            # parenthesis of this call are missing.
                            featurelets = featurelet_extraction(

                            list = []
                            for feature in featurelets:
                                # Join the three words of a featurelet with
                                # single spaces.
                                featurelet = '{} {} {}'.format(
                                    feature[0], feature[1], feature[2])
                                # NOTE(review): the start of this statement is
                                # missing -- presumably the Document is
                                # appended to `list`; confirm.
                                    Document(featurelet, name=featurelet))

                            # Perform hierarchical clustering
                            m = Model(list)
                            # NOTE(review): the remaining arguments of this
                            # call are missing.
                            cluster = m.cluster(method=HIERARCHICAL,

                            # Organize clusters into features and alternative tokens
                            # NOTE(review): the start of this tuple assignment
                            # is missing -- presumably `(features,`; confirm.
                             alterTokens) = group(cluster, [], [], [])

                            # Write results to file
                            # NOTE(review): the call that receives this row is
                            # missing -- presumably `writer.writerow(`; confirm.
                                [name, description, features, alterTokens])
# Example no. 2
	def kmeansCluster(self, documentList, k, iteration, distance, seed, p):
		"""Cluster documentList with k-means and return the clustering result.

		Keyword arguments:
		documentList	---	list handed to Model()
		k	---	number of clusters (must be an int)
		iteration	---	forwarded to Model.cluster as `iterations` (must be an int)
		distance	---	"cosine", "euclidean" or "manhattan" (case-insensitive)
		seed	---	"kmpp" or "random" (case-insensitive)
		p	---	int or float forwarded to Model.cluster as `p`

		Returns the value of Model.cluster(), or an error string describing
		the first invalid argument. Arguments are checked in the order
		distance, seed, k, iteration, p, documentList.
		"""
		# Validate the names first; the original was missing the `else:`
		# branches, so "manhattan" and "random" were rejected while unknown
		# names slipped through.
		distanceName = distance.lower()
		if distanceName not in ("cosine", "euclidean", "manhattan"):
			return "invalid distance"

		seedName = seed.lower()
		if seedName not in ("kmpp", "random"):
			return "invalid random"

		# Exact type checks are kept on purpose (bool is rejected for k and
		# iteration, as before).
		if type(k) is not int:
			return "k is not int"

		if type(iteration) is not int:
			return "iterartion is not int"

		if type(p) is not float and type(p) is not int:
			return "p is not float"

		if type(documentList) is not list:
			return "document List is not list"

		# Resolve the validated names to the clustering constants.
		distance = {
			"cosine": COSINE,
			"euclidean": EUCLIDEAN,
			"manhattan": MANHATTAN,
		}[distanceName]
		seed = {"kmpp": KMPP, "random": RANDOM}[seedName]

		self.iteration = iteration
		self.seed = seed
		self.p = p
		self.distance = distance
		model = Model(documentList)
		cluster = model.cluster(method=KMEANS, k=k, iterations=iteration, distance=distance, seed=seed, p=p)
		return cluster
# Example no. 3
# NOTE(review): d1, d2 and d3 are used on the Model line below but are not
# defined in the lines shown here -- presumably their definitions precede
# this fragment; confirm.
d4 = Document('The dog is happy.', name='dog2')
m = Model([d1, d2, d3, d4])
# For every document print its name, then for each concept of the document's
# LSA vector and each feature of that concept print the feature together with
# the product of the two weights (pairs with a zero weight are skipped).
for d in m.documents:
    print d.name
    for concept, w1 in m.lsa.vectors[d.id].items():
        for feature, w2 in m.lsa.concepts[concept].items():
            if w1 != 0 and w2 != 0:
                print(feature, w1 * w2)
# Clustering: build a model from three short documents and print the result
# of hierarchical clustering with k=2.
d1 = Document('Cats are independent pets.', name='cat')
d2 = Document('Dogs are trustworthy pets.', name='dog')
d3 = Document('Boxes are made of cardboard.', name='box')
m = Model((d1, d2, d3))
print m.cluster(method=HIERARCHICAL, k=2)
# Hierarchical clustering: build a nested Cluster by hand and print its depth
# and the result of flatten(1).
cluster = Cluster((1, Cluster((2, Cluster((3, 4))))))
print cluster.depth
print cluster.flatten(1)
# Training a classifier
nb = NB()
for review, rating in csv('data/input/reviews.csv'):
    # NOTE(review): `v` is built but never used in the lines shown --
    # presumably a training call on `nb` is missing here; confirm.
    v = Document(review, type=int(rating), stopwords=True)
print nb.classes
print nb.classify(Document('A good movie!'))
# Testing a classifier: load the reviews as (review, int rating) pairs.
data = csv('data/input/reviews.csv')
data = [(review, int(rating)) for review, rating in data]
# NOTE(review): the statement below is cut off; the rest of the list
# expression is missing.
data = [
class ClusterLSI(object):
	def __init__(self):
		"""Setting up ClusterLSI environment
		self.field = "scopeAndContent"
		self.limit = False
		self.identifier = "idDoc"
		self.model = False
		self.cluster = False
		self.depth = 0
		self.outputNodes = "./clus-nodes.csv"
		self.outputEdges = "./clus-edges.csv"
	def normalize(self, s):
		"""Normalize a value to a plain (byte) string.

		Keyword arguments:
		s	---	string, or any object accepted by str()

		Unicode strings are encoded as UTF-8; anything else goes through str().
		"""
		if isinstance(s, unicode):
			return s.encode('utf8', 'ignore')
		else:
			# This branch was unreachable before (missing `else:`), so
			# non-unicode input returned None.
			return str(s)

	def modeling(self, descriptions, field = False, limit = False):
		"""Build a pattern.vector.Model from descriptions and store it on the instance.

		Keyword arguments:
		descriptions	---	iterable of description objects indexable by self.field and self.identifier
		field	---	Field to look into, overrides the default self.field
		limit	---	Debug option. Limit the model to $limit items

		Each description becomes a Document whose text is description[self.field]
		and whose name is description[self.identifier]. Returns the new Model.
		"""
		if field:
			self.field = field
		if limit:
			self.limit = limit
		documents = []
		# Create one Document per description; for debugging the number of
		# documents can be capped by self.limit.
		for count, description in enumerate(descriptions, 1):
			documents.append(Document(description[self.field], name=description[self.identifier]))
			# Stop once the limit is reached (the `break` was missing).
			if self.limit and count == self.limit:
				break
		# Then create the model from the collected documents
		self.model = Model(documents)
		return self.model
	def clusterize(self, model = False):
		"""Cluster the model hierarchically and return the resulting cluster.

		Keyword arguments:
		model	---	If set, overrides the instance model

		The result of model.cluster(method=HIERARCHICAL, k=2) is also stored
		in self.cluster.
		"""
		if model:
			self.model = model
		self.cluster = self.model.cluster(method=HIERARCHICAL, k=2)
		return self.cluster

	def flatten(self, array, typeOf = "str"):
		"""Return the top-level items of array that have the requested type.

		Keyword arguments:
		array	---	A list of items
		typeOf	---	"str" to keep strings, "list" to keep lists

		Nested lists are not descended into. Any other typeOf value yields None.
		"""
		if typeOf == "str":
			return [item for item in array if isinstance(item, basestring)]
		elif typeOf == "list":
			return [item for item in array if isinstance(item, list)]
		# Unknown typeOf: keep the historical result (None), now explicit.
		return None

	def csv(self, array, parents = False, fake = 0):
		"""Return a tuple of csv string with given items and number of fake items
		Keyword arguments:
		array	---	A list of items
		parents	---	A list of parents
		fake	---	An index for fake parents 
		string = "" 
		#Making list of elements, avoid calling it once more
		currents = self.flatten(array, "str")
		children = self.flatten(array, "list")
		if len(currents) == 0:
			fake += 1
		Ffake = fake
		#If we have parents, we have parents connections
		if parents:
			for element in currents:
				for parent in parents:
					string += self.normalize(element) + ";" + parent + "\n"
		#Taking care of children
		for child in children:
			if len(currents) > 0:
				Sstring, Ffake = self.csv(child, currents, Ffake)
				Sstring, Ffake = self.csv(child, ["fake-"+str(fake)], Ffake)
			string += Sstring
		return string, Ffake

	def clusterToArray(self, Graph):
		"""Convert a cluster object to a nested list.

		Keyword arguments:
		Graph	---	Cluster or list

		Documents found at this level come first, followed by one nested
		list per sub-cluster.
		"""
		array = []
		Docs = [element for element in Graph if isinstance(element, pattern.vector.Document)]
		Clusts = [element for element in Graph if isinstance(element, list)]
		# NOTE(review): both loop bodies were missing. They are reconstructed
		# on the assumption that documents are represented by their name
		# (csv() consumes strings) and sub-clusters are converted
		# recursively -- confirm against the original implementation.
		for node in Docs:
			array.append(node.name)
		for node in Clusts:
			array.append(self.clusterToArray(node))
		return array
	def save(self, descriptions, csv, fakes = 0, nodesName = False, edgesName = False ):
		"""Output cluster into csv files
		Keyword arguments:
		descriptions	---	EHRI.get() description item
		fakes	---	Number of fakes parents
		nodesName	---	Filename for Nodes's CSV file
		edgesName	---	Filename for Edges's CSV file
		if nodesName:
			self.outputNodes = nodesName
		if edgesName:
			self.outputEdges = edgesName
		f = open(self.outputNodes, "wt")
		for description in descriptions:
			f.write(self.normalize(description[self.identifier] + ";" + description[self.identifier] + ";1\n"))
		while i <= fakes:
			f.write("fake-" + str(i) + ";" + "fake" + str(i) + ";0\n")
			i+= 1

		f = open(self.outputEdges, "wt")