Example #1
def findSynonyms(self, word, num):
    """
    Find "num" number of words closest in similarity to "word".
    word can be a string or vector representation.
    Returns a dataframe with two fields word and similarity (which
    gives the cosine similarity).
    """
    if not isinstance(word, str):
        word = _convert_to_vector(word)
    return self._call_java("findSynonyms", word, num)
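
For context, this method is normally reached through the public Word2Vec API. A minimal, hedged sketch follows; the toy corpus, column names, and parameter values are illustrative and not taken from the source above.

from pyspark.ml.feature import Word2Vec
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Two tiny tokenized "documents"; vectorSize/minCount are arbitrary demo values.
doc = spark.createDataFrame([("a b c".split(" "),),
                             ("a b b c a".split(" "),)], ["text"])
model = Word2Vec(vectorSize=4, minCount=0, inputCol="text", outputCol="result").fit(doc)

# DataFrame with columns "word" and "similarity" (cosine similarity).
model.findSynonyms("a", 2).show()
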
Example #2
def findSynonymsArray(self, word, num):
    """
    Find "num" number of words closest in similarity to "word".
    word can be a string or vector representation.
    Returns an array with two fields word and similarity (which
    gives the cosine similarity).
    """
    if not isinstance(word, str):
        word = _convert_to_vector(word)
    tuples = self._java_obj.findSynonymsArray(word, num)
    return list(map(lambda st: (st._1(), st._2()), list(tuples)))
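
Compared with findSynonyms, this variant returns a plain Python list of (word, similarity) tuples instead of a DataFrame. A small usage sketch, reusing the hypothetical model fitted in the sketch under Example #1:

synonyms = model.findSynonymsArray("a", 2)   # list of (word, cosine similarity) tuples
for word, similarity in synonyms:
    print(word, similarity)
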
Example #3
    def unpack_bus_attributes(row):
        """ Unpacks Business attributes and assigns them an index value."""

        # List to store business attributes.
        unpacked = list()
        # Unpack all attributes except PriceRange and Parking
        temp = [row[s] for s in bus_attributes]

        # Process PriceRange
        try:
            priceRange = int(row["RestaurantsPriceRange2"])
        except (TypeError, ValueError):
            # If no price range specified - default=2
            priceRange = 2

        # Process Parking
        try:
            parking = 1 if (row["BusinessParking"].find("True")) != -1 else -1
        except AttributeError:
            parking = 0

        # Process WiFi
        if row["WiFi"] == 'no' or row["WiFi"] == "u'no'":
            wifi = -1
        elif row["WiFi"] is None:
            wifi = 0
        else:
            wifi = 1

        # Tokenize all Boolean attributes.
        for i in temp:
            if i == "True":
                unpacked.append(1)
            elif i == "False":
                unpacked.append(-1)
            else:
                unpacked.append(0)
        # Append the WiFi, Parking, and PriceRange attributes
        unpacked.append(wifi)
        unpacked.append(parking)
        unpacked.append(priceRange)

        # Print any arrays that are not of the expected length (30).
        if len(unpacked) != 30:
            print(unpacked)
        return _convert_to_vector(
            csc_matrix(np.asarray(unpacked).astype(float)).T)
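
The return expression above is the part that touches _convert_to_vector: a plain Python list is turned into a scipy CSC column matrix and then into a Spark SparseVector. A self-contained, hedged sketch of that final step; the attribute codes are made up, and the pyspark.ml.linalg import path is an assumption (the same private helper also exists in pyspark.mllib.linalg).

import numpy as np
from scipy.sparse import csc_matrix
from pyspark.ml.linalg import _convert_to_vector    # assumption: ml variant of the helper

codes = [1, -1, 0, 1, 0, 2]                          # hypothetical tokenized attribute values
col = csc_matrix(np.asarray(codes).astype(float)).T  # sparse column matrix of shape (6, 1)
vec = _convert_to_vector(col)                        # SparseVector of size 6, non-zero entries only
print(vec)
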
Example #4
def dense_to_sparse(vector):
    """Convert a dense pyspark vector to a SparseVector via a scipy CSC column matrix."""
    return _convert_to_vector(scipy.sparse.csc_matrix(vector.toArray()).T)
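
A hedged usage sketch for the helper above, assuming the input vector comes from pyspark.mllib.linalg (the same pattern works with the pyspark.ml vector types):

import scipy.sparse
from pyspark.mllib.linalg import Vectors, _convert_to_vector

dense = Vectors.dense([0.0, 3.0, 0.0, 5.0])
sparse = dense_to_sparse(dense)   # SparseVector of size 4 holding only the two non-zero entries
print(sparse)
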
Example #5
import time

import numpy as np
import tensorframes as tfs

from pyspark.ml.linalg import VectorUDT, _convert_to_vector
from pyspark.sql.types import Row, StructField, StructType

# `sqlContext` is assumed to be provided by the surrounding Spark shell/notebook session.

# Small vectors
num_features = 100
# The number of clusters
k = 10
num_points = 100000
num_iters = 10
FEATURES_COL = "features"

np.random.seed(2)
np_data = [x.tolist() for x in np.random.uniform(0.0, 1.0, size=(num_points, num_features))]
schema = StructType([StructField(FEATURES_COL, VectorUDT(), False)])
mllib_rows = [Row(_convert_to_vector(x)) for x in np_data]
mllib_df = sqlContext.createDataFrame(mllib_rows, schema).coalesce(1).cache()

df = sqlContext.createDataFrame([[r] for r in np_data]).toDF(FEATURES_COL).coalesce(1)
# For now, analysis is still required. We cache the output because we are going to perform
# multiple runs on the dataset.
df0 = tfs.analyze(df).cache()


# Force materialization of both cached DataFrames.
mllib_df.count()
df0.count()

np.random.seed(2)
init_centers = np.random.randn(k, num_features)
start_centers = init_centers
dataframe = df0
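
The snippet stops after preparing the data (k clusters, num_iters iterations, random initial centers), presumably as input to a KMeans benchmark driven elsewhere. Purely as an illustration of how the prepared DataFrame could be consumed, here is a hedged sketch using the built-in pyspark.ml KMeans; note that, unlike the start_centers variable above suggests, the built-in estimator does not accept explicit initial centers.

from pyspark.ml.clustering import KMeans

kmeans = KMeans(k=k, maxIter=num_iters, featuresCol=FEATURES_COL, seed=2)
kmeans_model = kmeans.fit(mllib_df)
centers = kmeans_model.clusterCenters()   # list of k numpy arrays of length num_features
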
Example #6
# Small vectors
num_features = 100
# The number of clusters
k = 10
num_points = 100000
num_iters = 10
FEATURES_COL = "features"

np.random.seed(2)
np_data = [
    x.tolist()
    for x in np.random.uniform(0.0, 1.0, size=(num_points, num_features))
]
schema = StructType([StructField(FEATURES_COL, VectorUDT(), False)])
mllib_rows = [Row(_convert_to_vector(x)) for x in np_data]
mllib_df = sqlContext.createDataFrame(mllib_rows, schema).coalesce(1).cache()

df = sqlContext.createDataFrame([[r] for r in np_data]).toDF(FEATURES_COL).coalesce(1)
# For now, analysis is still required. We cache the output because we are going to perform
# multiple runs on the dataset.
df0 = tfs.analyze(df).cache()

mllib_df.count()
df0.count()

np.random.seed(2)
init_centers = np.random.randn(k, num_features)
start_centers = init_centers
dataframe = df0