Example #1
    import nltk
    from pandas import DataFrame
    from sklearn.feature_extraction.text import TfidfVectorizer

    def tf_idf(file_path):
        # Load the tip records and pull out the raw text of each one.
        records = ETLUtils.load_json_file(file_path)
        data = [record['text'] for record in records]

        # Fit a TF-IDF model over the whole corpus, dropping English stop words.
        vectorizer = TfidfVectorizer(min_df=1, stop_words='english')
        train = vectorizer.fit_transform(data)
        # print("Vocabulary:", vectorizer.get_feature_names_out())
        num_samples, num_features = train.shape
        print("#samples: %d, #features: %d" % (num_samples, num_features))

        # Keep only the tips written for a single business of interest.
        business_records = ETLUtils.filter_records(records, 'business_id',
                                                   ['uFJwKlHL6HyHSJmORO8-5w'])
        business_data = [record['text'] for record in business_records]
        freq_term_matrix = vectorizer.transform(business_data)
        # get_feature_names() was removed in scikit-learn 1.2.
        vocabulary = vectorizer.get_feature_names_out()

        # Collect one (tip, word, tf-idf) triple per non-zero matrix entry.
        my_list = []
        rows, cols = freq_term_matrix.nonzero()
        for row, col in zip(rows, cols):
            my_dict = {}
            word = vocabulary[col]
            my_dict['tip_id'] = row
            my_dict['word'] = word
            my_dict['tfidf'] = freq_term_matrix[row, col]
            my_list.append(my_dict)

        # Sum the tf-idf scores per word and rank the words by total weight.
        data_frame = DataFrame(my_list)
        suma = data_frame.groupby('word')['tfidf'].sum()
        ordenado = suma.sort_values()  # Series.order() was removed from pandas
        print(ordenado)

        # for row in freq_term_matrix:
        #     print(row)

        # Stemmer (created here, but unused in the rest of this excerpt)
        stemmer = nltk.stem.SnowballStemmer('english')
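
ETLUtils is a project-specific helper module, not a published library. As a rough, hypothetical sketch of the three helpers these examples depend on (load_json_file, filter_records, and drop_fields), assuming the input is a JSON-lines file such as the Yelp tips dump:

    import json

    class ETLUtils:

        @staticmethod
        def load_json_file(file_path):
            # One JSON object per line (JSON-lines), as in the Yelp dataset dumps.
            with open(file_path) as json_file:
                return [json.loads(line) for line in json_file]

        @staticmethod
        def filter_records(records, field, values):
            # Keep only the records whose `field` value is in `values`.
            return [record for record in records if record[field] in values]

        @staticmethod
        def drop_fields(fields, records):
            # Remove the given fields from every record, in place.
            for record in records:
                for field in fields:
                    record.pop(field, None)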
Example #2
    def tf_idf_tips(file_path):
        # Vectorize the tip texts with a stemming-aware TF-IDF model
        # and report the resulting matrix dimensions.
        records = ETLUtils.load_json_file(file_path)
        data = [record['text'] for record in records]
        vectorizer = StemmedTfidfVectorizer(min_df=1, stop_words='english')
        vectorized = vectorizer.fit_transform(data)
        num_samples, num_features = vectorized.shape
        print("#samples: %d, #features: %d" % (num_samples, num_features))

        return vectorized
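
StemmedTfidfVectorizer is not a scikit-learn class. A minimal sketch of how such a vectorizer is commonly built, and presumably what this example assumes: subclass TfidfVectorizer and stem every token produced by the standard analyzer.

    import nltk
    from sklearn.feature_extraction.text import TfidfVectorizer

    class StemmedTfidfVectorizer(TfidfVectorizer):

        def build_analyzer(self):
            # Wrap the stock analyzer so each token is stemmed before counting.
            stemmer = nltk.stem.SnowballStemmer('english')
            analyzer = super().build_analyzer()
            return lambda doc: [stemmer.stem(word) for word in analyzer(doc)]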
Example #3
    from pandas import DataFrame

    def analyze(file_path):
        # Load the records and drop the fields that are not needed for counting.
        records = ETLUtils.load_json_file(file_path)
        ETLUtils.drop_fields(['text', 'type', 'date', 'user_id', 'likes'],
                             records)

        # Count the tips per business and show the 1000 most-tipped businesses.
        data_frame = DataFrame(records)
        counts = data_frame.groupby('business_id').size()
        counts = counts.sort_values(ascending=False)  # Series.sort() was removed from pandas
        top_counts = counts[:1000]
        print(top_counts)

        print(records[0].keys())
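
A hypothetical driver tying the examples together (the file name is an assumption; the Yelp academic dataset ships its tips under a similar name):

    if __name__ == '__main__':
        tips_file = 'yelp_academic_dataset_tip.json'  # assumed local path
        analyze(tips_file)
        matrix = tf_idf_tips(tips_file)
        print(matrix.shape)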