def save_(tweet):
    """Turn a raw tweet dict into a flat document ready for persistence.

    English tweets ('lang' == 'en') yield a document carrying the tweet's
    identity/location fields, time-period fields derived from
    'created_at', and sentiment scores of the preprocessed text.
    Any other language yields an empty dict.
    """
    period = timeFormat.get_period(tweet['created_at'])
    if tweet['lang'] != 'en':
        # Non-English tweets are not scored; caller receives {}.
        return {}
    cleaned = preporcessing.pre(tweet['text'])
    scores = sentiment_analysis.get_sentiment_scores(cleaned)
    doc = {'_id': tweet['id_str']}
    for field in ('created_at', 'text', 'user', 'geo', 'coordinates', 'place'):
        doc[field] = tweet[field]
    for slot, label in enumerate(('weekday', 'month', 'day', 'hour', 'year')):
        doc[label] = period[slot]
    doc['negative'] = scores['neg']
    doc['positive'] = scores['pos']
    doc['neu'] = scores['neu']
    doc['compound'] = scores['compound']
    return doc
def update_db(db):
    """Backfill sentiment fields on English tweets that lack them.

    Iterates every document id in *db*; any English tweet without a
    'compound' field gets negative/positive/neu/compound scores computed
    from its preprocessed text and is saved back to the database.
    Documents that already carry 'compound', or are not English, are
    left untouched.
    """
    for doc_id in db:  # renamed from `id` to avoid shadowing the builtin
        tweet = db[doc_id]
        if 'compound' in tweet:
            # Already scored on a previous run.
            continue
        # (Removed a no-op bare expression `tweet['text']` that had no effect.)
        if tweet['lang'] != 'en':
            continue
        pred_text = preporcessing.pre(tweet['text'])
        sentiment = get_sentiment_scores(pred_text)
        tweet['negative'] = sentiment['neg']
        tweet['positive'] = sentiment['pos']
        tweet['neu'] = sentiment['neu']
        tweet['compound'] = sentiment['compound']
        db.save(tweet)
# NOTE(review): this top-level ingest fragment is truncated in this chunk —
# the `new_dic` literal is never closed (the sentiment keys, closing brace,
# and whatever persistence step follows are missing). It mirrors save_()
# above: reads a CouchDB JSON-lines dump line by line, parses each
# well-formed '{...}' line, derives time-period fields, resolves an area
# name from the tweet's geo coordinates (only for areas known to config),
# and builds a scored document for English tweets. Left byte-identical;
# recover the missing tail from version control before editing.
with open(file_path, 'r') as f: # with open('/Users/jiaqili/Desktop/project/melbourne2015-01-01_2015-01-03.json', 'r')as f: line = f.readline() while line: l = line.strip('\n, ') if l.startswith('{') and l.endswith('}'): l = json.loads(l) tweet = l['doc'] time_period = timeFormat.get_period(tweet['created_at']) if tweet["geo"] != 'null': name = get_area(tweet['geo']['coordinates']) if name in config.coordinates.keys( ) and name not in config.L_name: db_name = name if tweet['lang'] == 'en': pred_text = preporcessing.pre(tweet['text']) sentiment = sentiment_analysis.get_sentiment_scores( pred_text) new_dic = { '_id': tweet['id_str'], 'created_at': tweet['created_at'], 'text': tweet['text'], 'user': tweet['user'], 'geo': tweet['geo'], 'coordinates': tweet['coordinates'], 'place': tweet['place'], 'weekday': time_period[0], 'month': time_period[1], 'day': time_period[2], 'hour': time_period[3], 'year': time_period[4],
def _scored_tweet_doc(status):
    """Build a storable document from a tweepy status ``_json`` dict.

    Expects ``status['_id']`` to be set already; derives time-period
    fields from the status's own 'created_at' and sentiment scores from
    its preprocessed text. Shared by both harvesting loops below.
    """
    time_period = timeFormat.get_period(status['created_at'])
    pred_text = pre(status['text'])
    sentiment = get_sentiment_scores(pred_text)
    return {
        '_id': status['_id'],
        'created_at': status['created_at'],
        'text': status['text'],
        'user': status['user'],
        'geo': status['geo'],
        'coordinates': status['coordinates'],
        'place': status['place'],
        'weekday': time_period[0],
        'month': time_period[1],
        'day': time_period[2],
        'hour': time_period[3],
        'year': time_period[4],
        'negative': sentiment['neg'],
        'positive': sentiment['pos'],
        'neu': sentiment['neu'],
        'compound': sentiment['compound'],
    }


def get_user_timeline_tweets(db_raw, api, city_name):
    """Harvest timeline tweets for users flagged by the
    'original_tweets/username_not_used' view, plus their friends' tweets,
    score the English ones, and save those whose place name matches
    *city_name* back into *db_raw*. Each processed user's stored document
    is marked with username=True.

    Best-effort by design: per-user and per-friend API failures are
    swallowed so one bad account cannot stop the whole harvest.
    """
    result = db_raw.view('original_tweets/username_not_used')
    for res in result:
        doc_id = res['id']
        tweet = db_raw[doc_id]
        name = tweet['user']['screen_name']
        try:
            for friend in Cursor(api.friends, screen_name=name).items(200):
                friend_id = friend._json['id']
                try:
                    for friend_raw_tweet in Cursor(
                            api.user_timeline, user_id=friend_id).items(200):
                        status = friend_raw_tweet._json
                        status['_id'] = str(status['id'])
                        status['username'] = True
                        if city_name in city_list:
                            city_name = 'melbourne'
                        # BUG FIX: the original read `raw_tweet._json` here
                        # (and in the lang/text lookups below), a name not yet
                        # defined on the first pass — the resulting NameError
                        # was silently swallowed, so no friend tweet was ever
                        # saved. Use the friend's own status throughout.
                        area_name = status['place']['name']
                        new_dic = {}
                        if status['lang'] == 'en':
                            new_dic = _scored_tweet_doc(status)
                        try:
                            if new_dic and area_name.lower() == city_name:
                                db_raw.save(new_dic)
                        except couchdb.http.ResourceConflict:
                            # Already stored; skip duplicates quietly.
                            pass
                except Exception:
                    # NOTE(review): was a bare `except:`; narrowed so
                    # KeyboardInterrupt/SystemExit still propagate.
                    pass
        except Exception:
            pass
        try:
            for raw_tweet in Cursor(api.user_timeline,
                                    screen_name=name).items(200):
                status = raw_tweet._json
                status['_id'] = str(status['id'])
                status['username'] = True
                area_name = status['place']['name']
                if city_name in city_list:
                    city_name = 'melbourne'
                new_dic = {}
                if status['geo'] != 'null':
                    # BUG FIX: original used key 'coordinate'; every other
                    # call site in this file uses 'coordinates', and the
                    # KeyError aborted the rest of this user's timeline.
                    # Return value was unused in the original; kept that way.
                    get_area(status['geo']['coordinates'])
                if status['lang'] == 'en':
                    # BUG FIX: time-period fields were derived from the
                    # *stored user tweet's* created_at instead of this
                    # timeline tweet's, unlike every other block in the file.
                    new_dic = _scored_tweet_doc(status)
                try:
                    if new_dic and area_name.lower() == city_name:
                        db_raw.save(new_dic)
                except couchdb.http.ResourceConflict:
                    pass
            # Mark the source document so the view stops returning this user.
            doc = db_raw.get(doc_id)
            doc['username'] = True
            db_raw.save(doc)
        except Exception:
            pass
def get_topic():
    """Fit an LDA topic model over a local tweet dump and print the topics.

    Reads the hard-coded CouchDB JSON-lines dump, preprocesses each
    tweet's text, vectorizes with a 500-term English CountVectorizer,
    fits a 10-topic batch LDA (fixed random_state for reproducibility),
    and prints a {"Topic i:": "top words"} dict. Returns None.
    """
    print('start tf_vectorizer')
    n_features = 500
    tf_vectorizer = CountVectorizer(max_features=n_features,
                                    stop_words='english')

    text = []
    print('start getting texts')
    # NOTE(review): path is hard-coded to a developer machine; parameterize
    # before reuse.
    dump_path = ("/Users/jiaqili/Desktop/project/"
                 "melbourne2015-01-01_2015-01-03.json")
    with open(dump_path, 'r') as f:
        for raw_line in f:
            raw_line = raw_line.strip('\n, ')
            # Only well-formed '{...}' lines are tweet rows; the dump also
            # contains wrapper lines that must be skipped.
            if raw_line.startswith('{') and raw_line.endswith('}'):
                row = json.loads(raw_line)
                tweet = row['doc']
                text.append(preporcessing.pre(tweet['text']))

    print(len(text))
    print('start fit_transform')
    tf = tf_vectorizer.fit_transform(text)
    print(type(tf))
    n_topics = 10
    print(n_topics)
    print('start lda')
    lda = LatentDirichletAllocation(
        n_components=n_topics,
        max_iter=50,
        learning_method='batch',
        learning_offset=50.,
        random_state=0)
    print(type(lda))
    print('something')
    lda.fit(tf)
    print('something')

    def print_top_words(model, feature_names, n_top_words):
        # Map "Topic i:" -> space-joined top-n terms, heaviest weight first.
        temp = {}
        for topic_idx, topic in enumerate(model.components_):
            key = "Topic %d:" % topic_idx
            temp[key] = " ".join(
                feature_names[i]
                for i in topic.argsort()[:-n_top_words - 1:-1])
        return print(temp)  # prints and returns None (original behavior)

    n_top_words = 10
    print(n_top_words)
    # NOTE(review): get_feature_names() was removed in scikit-learn 1.2;
    # newer environments need get_feature_names_out() instead.
    tf_feature_names = tf_vectorizer.get_feature_names()
    print(tf_feature_names)
    return print_top_words(lda, tf_feature_names, n_top_words)