Python ETLUtils примеры использования

Язык программирования: Python

Пространство имен/Пакет: etl.etl_utils

Класс/Тип: ETLUtils

Примеров на hotexamples.com: 6

Python ETLUtils — найдено 6 примеров. Это лучшие примеры кода на Python для etl.etl_utils.ETLUtils, взятые из проектов с открытым исходным кодом. Вы можете оценивать каждый пример, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

load_json_file(3)

drop_fields(1)

filter_records(1)

Пример #1

Показать файл

Файл: tip_tfidf.py Проект: neostoic/yelp-1

    def tf_idf(file_path):
        """Print the summed TF-IDF weight of each word for one business.

        The records are loaded with ``ETLUtils.load_json_file`` and a
        ``TfidfVectorizer`` is fitted on the ``'text'`` field of all of them.
        Only the records whose ``'business_id'`` is
        ``'uFJwKlHL6HyHSJmORO8-5w'`` are then transformed; their non-zero
        weights are summed per word and printed in ascending order.

        Nothing is returned; all results go to standard output.

        Args:
            file_path: passed unchanged to ``ETLUtils.load_json_file``.
        """
        records = ETLUtils.load_json_file(file_path)
        data = [record['text'] for record in records]
        vectorizer = TfidfVectorizer(min_df=1, stop_words='english')
        train = vectorizer.fit_transform(data)
        num_samples, num_features = train.shape
        print("#samples: %d, #features: %d" % (num_samples, num_features))

        business_records = ETLUtils.filter_records(records, 'business_id',
                                                   ['uFJwKlHL6HyHSJmORO8-5w'])
        business_data = [record['text'] for record in business_records]
        freq_term_matrix = vectorizer.transform(business_data)

        # Recent scikit-learn releases only offer get_feature_names_out();
        # older ones only offer get_feature_names().  Support both.
        get_names = getattr(vectorizer, 'get_feature_names_out', None)
        if get_names is None:
            get_names = vectorizer.get_feature_names
        vocabulary = get_names()

        # One row per non-zero cell; 'tip_id' is the row index of the
        # transformed matrix, i.e. the position within business_data.
        rows, cols = freq_term_matrix.nonzero()
        my_list = [
            {
                'tip_id': row,
                'word': vocabulary[col],
                'tfidf': freq_term_matrix[row, col],
            }
            for row, col in zip(rows, cols)
        ]

        data_frame = DataFrame(my_list)
        # Sum only the column that is reported instead of every column.
        suma = data_frame.groupby('word')['tfidf'].sum()
        # Series.order() no longer exists in pandas; sort_values() replaces it.
        ordenado = suma.sort_values()
        print(ordenado)

        # Stemmer (not used in the visible part of this function).
        stemmer = nltk.stem.SnowballStemmer('english')

Пример #2

Показать файл

Файл: tip_tfidf.py Проект: antoine-tran/yelp

    def tf_idf(file_path):
        """Report per-word TF-IDF totals for the tips of a single business.

        Steps performed:

        1. Load all records via ``ETLUtils.load_json_file(file_path)``.
        2. Fit a ``TfidfVectorizer`` (``min_df=1``, English stop words) on the
           ``'text'`` field of every record and print the matrix shape.
        3. Keep the records whose ``'business_id'`` equals
           ``'uFJwKlHL6HyHSJmORO8-5w'`` and transform their texts.
        4. Sum the non-zero weights per word and print them, smallest first.

        The function prints its results and returns ``None``.

        Args:
            file_path: passed unchanged to ``ETLUtils.load_json_file``.
        """
        records = ETLUtils.load_json_file(file_path)
        data = [record['text'] for record in records]
        vectorizer = TfidfVectorizer(min_df=1, stop_words='english')
        train = vectorizer.fit_transform(data)
        num_samples, num_features = train.shape
        print("#samples: %d, #features: %d" % (
            num_samples, num_features))

        business_records = ETLUtils.filter_records(
            records, 'business_id', ['uFJwKlHL6HyHSJmORO8-5w'])
        business_data = [record['text'] for record in business_records]
        freq_term_matrix = vectorizer.transform(business_data)

        # get_feature_names() is gone from current scikit-learn; fall back to
        # it only when the newer get_feature_names_out() is unavailable.
        if hasattr(vectorizer, 'get_feature_names_out'):
            vocabulary = vectorizer.get_feature_names_out()
        else:
            vocabulary = vectorizer.get_feature_names()

        my_list = []
        rows, cols = freq_term_matrix.nonzero()
        for row, col in zip(rows, cols):
            # 'tip_id' is the row index inside the transformed matrix.
            my_list.append({
                'tip_id': row,
                'word': vocabulary[col],
                'tfidf': freq_term_matrix[row, col],
            })

        data_frame = DataFrame(my_list)
        # Only the 'tfidf' column is reported, so only that column is summed.
        suma = data_frame.groupby('word')['tfidf'].sum()
        # sort_values() replaces Series.order(), which pandas has removed.
        ordenado = suma.sort_values()
        print(ordenado)

        # Stemmer (not used in the visible part of this function).
        stemmer = nltk.stem.SnowballStemmer('english')

Пример #3

Показать файл

Файл: tip_tfidf.py Проект: neostoic/yelp-1

    def analyze(file_path):
        """Print the record counts of the most frequent businesses.

        Loads the records with ``ETLUtils.load_json_file``, asks
        ``ETLUtils.drop_fields`` to drop the ``'text'``, ``'type'``,
        ``'date'``, ``'user_id'`` and ``'likes'`` fields, counts the records
        per ``'business_id'`` and prints the 1000 largest counts.  Finally the
        keys of the first record are printed.

        Args:
            file_path: passed unchanged to ``ETLUtils.load_json_file``.
        """
        records = ETLUtils.load_json_file(file_path)
        # The return value is ignored, so drop_fields presumably edits the
        # records in place -- TODO confirm against ETLUtils.
        ETLUtils.drop_fields(['text', 'type', 'date', 'user_id', 'likes'],
                             records)
        data_frame = DataFrame(records)
        counts = data_frame.groupby('business_id').size()
        # The in-place Series.sort() no longer exists in pandas;
        # sort_values() returns the sorted Series instead.
        counts = counts.sort_values(ascending=False)
        # .iloc makes the positional "first 1000" slice explicit.
        top_counts = counts.iloc[:1000]
        print(top_counts)

        # list(...) keeps the output a plain list on both Python 2 and 3.
        print(list(records[0].keys()))

Пример #4

Показать файл

Файл: tip_tfidf.py Проект: antoine-tran/yelp

    def analyze(file_path):
        """Show which businesses have the most records in ``file_path``.

        The records come from ``ETLUtils.load_json_file``.  After
        ``ETLUtils.drop_fields`` is called for the ``'text'``, ``'type'``,
        ``'date'``, ``'user_id'`` and ``'likes'`` fields, the records are
        grouped by ``'business_id'`` and the group sizes are sorted in
        descending order.  The top 1000 sizes and the keys of the first
        record are printed; nothing is returned.

        Args:
            file_path: passed unchanged to ``ETLUtils.load_json_file``.
        """
        records = ETLUtils.load_json_file(file_path)
        # NOTE(review): the result of drop_fields is discarded, which suggests
        # it mutates ``records`` -- verify against ETLUtils.
        ETLUtils.drop_fields(['text', 'type', 'date', 'user_id', 'likes'],
                             records)
        data_frame = DataFrame(records)
        # pandas has removed the in-place Series.sort(); sort_values() hands
        # back a new, sorted Series.
        counts = data_frame.groupby('business_id').size().sort_values(
            ascending=False)
        # Explicit positional slice of the 1000 largest counts.
        top_counts = counts.iloc[:1000]
        print(top_counts)

        # A function-call print works on Python 2 and 3; list(...) keeps the
        # printed form a plain list on both.
        print(list(records[0].keys()))

Пример #5

Показать файл

Файл: tip_tfidf.py Проект: neostoic/yelp-1

    def tf_idf_tips(file_path):
        """Vectorize the ``'text'`` field of every record in ``file_path``.

        The records are read with ``ETLUtils.load_json_file`` and fed to a
        ``StemmedTfidfVectorizer`` (``min_df=1``, English stop words).  The
        shape of the fitted matrix is printed before the matrix is returned.

        Args:
            file_path: passed unchanged to ``ETLUtils.load_json_file``.

        Returns:
            The result of ``fit_transform`` on the record texts.
        """
        texts = []
        for entry in ETLUtils.load_json_file(file_path):
            texts.append(entry['text'])

        stemmed_vectorizer = StemmedTfidfVectorizer(
            min_df=1, stop_words='english')
        tfidf_matrix = stemmed_vectorizer.fit_transform(texts)

        sample_count, feature_count = tfidf_matrix.shape
        summary = "#samples: %d, #features: %d" % (sample_count, feature_count)
        print(summary)

        return tfidf_matrix

Пример #6

Показать файл

Файл: tip_tfidf.py Проект: antoine-tran/yelp

    def tf_idf_tips(file_path):
        """Build the stemmed TF-IDF matrix for the texts stored in a file.

        Every record returned by ``ETLUtils.load_json_file(file_path)``
        contributes its ``'text'`` value.  A ``StemmedTfidfVectorizer`` with
        ``min_df=1`` and English stop words is fitted on those texts, the
        matrix dimensions are printed, and the matrix is returned.

        Args:
            file_path: passed unchanged to ``ETLUtils.load_json_file``.

        Returns:
            Whatever ``StemmedTfidfVectorizer.fit_transform`` produces.
        """
        loaded = ETLUtils.load_json_file(file_path)
        corpus = [item['text'] for item in loaded]

        # Fit and transform in one chained expression.
        weights = StemmedTfidfVectorizer(
            min_df=1, stop_words='english').fit_transform(corpus)

        n_rows, n_cols = weights.shape
        print("#samples: %d, #features: %d" % (n_rows, n_cols))
        return weights