Example #1
def preprocess_csv(csv_file_name, processed_file_name, test_file=False):
    save_to_file = open(processed_file_name, 'w')
    # Detect the file encoding for inspection; note that the result is only
    # printed, not passed to the text-mode open below.
    import chardet
    with open(csv_file_name, 'rb') as rawdata:
        result = chardet.detect(rawdata.read(100000))
        print(result)
    with open(csv_file_name, 'r') as csv:
        lines = csv.readlines()
        total = len(lines)
        for i, line in enumerate(lines):
            tweet_id = line[:line.find(',')]
            if not test_file:
                line = line[1 + line.find(','):]
                positive = int(line[:line.find(',')])
            line = line[1 + line.find(','):]
            tweet = line
            processed_tweet = preprocess_tweet(tweet)
            if not test_file:
                save_to_file.write('%s,%d,%s\n' %
                                   (tweet_id, positive, processed_tweet))
            else:
                save_to_file.write('%s,%s\n' %
                                   (tweet_id, processed_tweet))
            write_status(i + 1, total)
    save_to_file.close()
    print('\nSaved processed tweets to: %s' % processed_file_name)
    return processed_file_name
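Most of these examples call helpers defined elsewhere in their repositories. A minimal sketch of what they are assumed to do, inferred from the call sites above (write_status prints progress; the preprocess_tweet body is purely illustrative, and some variants also take a slang dictionary):

import sys

def write_status(current, total):
    # Overwrite a single console line with progress, e.g. "Processing 42/100".
    sys.stdout.write('\rProcessing %d/%d' % (current, total))
    sys.stdout.flush()

def preprocess_tweet(tweet):
    # Illustrative stand-in: real versions normalise case, URLs, handles,
    # and emoticons before the text is written out or vectorised.
    return tweet.strip().lower()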
Example #2
def preprocess_csv(csv_file_name, processed_file_name, test_file):

    """ Creates a dictionary with slangs and their equivalents and replaces them """
    with open('slang.txt') as file:
        slang_dict = dict(map(str.strip, line.partition('\t')[::2]) for line in file if line.strip())

    save_to_file = open(processed_file_name, 'w')

    with open(csv_file_name, 'r') as csv:
        lines = csv.readlines()
        total = len(lines)
        for i, line in enumerate(lines):
            tweet_id = line[:line.find(',')]
            if not test_file:
                line = line[1 + line.find(','):]
                positive = int(line[:line.find(',')])
            line = line[1 + line.find(','):-1]
            tweet = line
            processed_tweet = preprocess_tweet(tweet, slang_dict)
            if not test_file:
                save_to_file.write('%s,%d,%s\n' %
                                    (tweet_id, positive, processed_tweet))
            else:
                save_to_file.write('%s,%s\n' %
                                    (tweet_id, processed_tweet))
            write_status(i + 1, total)
    save_to_file.close()
    print ('\nSaved processed tweets to: %s' % processed_file_name)
    return processed_file_name
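The dictionary comprehension above expects slang.txt to hold one tab-separated slang<TAB>expansion pair per line. A small demonstration of the parsing idiom, with a made-up sample line:

line = 'u\tyou'                                      # one line from slang.txt
key, value = map(str.strip, line.partition('\t')[::2])
print(key, '->', value)                              # prints: u -> you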
Example #3
def process_tweets(csv_file, test_file=True):
    """ Returns a list of tuples of type (tweet_id, feature_vector)
            or (tweet_id, sentiment, feature_vector) """
    tweets = []
    print('Generating feature vectors')
    with open(csv_file, 'r') as csv:
        lines = csv.readlines()
        total = len(lines)
        for i, line in enumerate(lines):
            if test_file:
                tweet_id = line[:line.find(',')]
                tweet = line[1 + line.find(','):]
            else:
                tweet_id = line[:line.find(',')]
                line = line[1 + line.find(','):]
                sentiment = int(line[:line.find(',')])
                tweet = line[1 + line.find(','):]
                # tweet_id, sentiment, tweet = line.split(',')
            feature_vector = get_feature_vector(tweet)
            if test_file:
                tweets.append((tweet_id, feature_vector))
            else:
                tweets.append((tweet_id, int(sentiment), feature_vector))
            utils.write_status(i + 1, total)
    print('\n')
    return tweets
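The commented-out split above hints at a simpler parse, but note that a plain line.split(',') breaks on tweets that themselves contain commas; passing maxsplit keeps them intact. A sketch with a made-up row:

row = '123,1,hello, world'
tweet_id, sentiment, tweet = row.split(',', 2)       # split at most twice
print(tweet)                                         # prints: hello, world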
Example #4
def process_tweets(csv_file, test_file=True):
    """
    Generates training X, y pairs.
    """
    tweets = []
    labels = []
    print ('Generating feature vectors')
    with open(csv_file, 'r') as csv:
        lines = csv.readlines()
        total = len(lines)
        for i, line in enumerate(lines):
            tweet_id, sentiment, tweet = line.split(',')
            feature_vector = get_feature_vector(tweet)
            tweets.append(feature_vector)
            labels.append(int(sentiment))
            utils.write_status(i + 1, total)
    print ('\n')
    return tweets, np.array(labels)
Example #5
def preprocess_csv(csv_file_name, processed_file_name, test_file=False):
    save_to_file = open(processed_file_name, 'w')
    pos = 0
    with open(csv_file_name, 'r') as csv:
        lines = csv.readlines()
        total = len(lines)
        for i, line in enumerate(lines):
            tweet_id = line[:line.find(',')]
            if not test_file:
                line = line[1 + line.find(','):]
                positive = line[:line.find(',')]
                if "1" in positive:
                    pos = 1
                elif "0" in positive:
                    pos = 0
            line = line[1 + line.find(','):]
            tweet = line
            processed_tweet = preprocess_tweet(tweet)
            if not test_file:
                save_to_file.write('%s,%d,%s\n' %
                                   (tweet_id, pos, processed_tweet))
            else:
                save_to_file.write('%s,%s\n' % (tweet_id, processed_tweet))
            write_status(i + 1, total)
    save_to_file.close()
    print('\nSaved processed tweets to: %s' % processed_file_name)
    return processed_file_name
Example #6
def process_CONTENT(csv_file, test_file=True):
    """Returns a list of tuples of type (content_id, feature_vector)
            or (content_id, sentiment, feature_vector)

    Args:
        csv_file (str): Name of processed csv file generated by preprocess.py
        test_file (bool, optional): If processing test file

    Returns:
        list: Of tuples
    """
    CONTENT = []
    print('Generating feature vectors')
    with open(csv_file, 'r') as csv:
        lines = csv.readlines()
        total = len(lines)
        for i, line in enumerate(lines):
            if test_file:
                CONTENT_id, CONTENTS = line.split(',')
            else:
                CONTENT_id, sentiment, CONTENTS = line.split(',')
            feature_vector = get_feature_vector(CONTENTS)
            if test_file:
                CONTENT.append((CONTENT_id, feature_vector))
            else:
                CONTENT.append((CONTENT_id, int(sentiment), feature_vector))
            utils.write_status(i + 1, total)
    print('\n')
    return CONTENT
Example #7
def preprocess_df(structured_file_name):
    overall_file_name = sys.argv[1][:-4] + '-overall.csv'
    room_file_name = sys.argv[1][:-4] + '-room.csv'
    cleanliness_file_name = sys.argv[1][:-4] + '-cleanliness.csv'
    service_file_name = sys.argv[1][:-4] + '-service-linear.csv'
    
    save_to_file = open(service_file_name, 'w')

    with open(structured_file_name, 'r',encoding='utf-8') as csv:
        lines = csv.readlines()
        total = len(lines)
        for i, line in enumerate(lines):
            if i==0: continue
            line = line.split(',')
            post_id, content, n_likes, sentiment_hand, relevance = line[0],line[10],line[5],line[9],line[8]
            sentiment_hand = int(float(sentiment_hand))
            # Binarise the hand-labelled score (>3 means positive); note the
            # write below stores a likes-based label, not this sentiment.
            if sentiment_hand > 3: sentiment = 1
            else: sentiment = 0
            processed_content = preprocess_tweet(content)
            save_to_file.write('%s,%s,%s\n' % (post_id,int(int(n_likes)>0),processed_content))
            
            write_status(i + 1, total)
            
    save_to_file.close()
    return
Example #8
def process_tweets(csv_file, test_file=True):
    """Returns a list of tuples of type (tweet_id, feature_vector)
            or (tweet_id, sentiment, feature_vector)

    Args:
        csv_file (str): Name of processed csv file generated by preprocess.py
        test_file (bool, optional): If processing test file

    Returns:
        list: Of tuples
    """
    tweets = []
    print('Generating feature vectors')
    with open(csv_file, 'r') as csv:
        lines = csv.readlines()
        total = len(lines)
        for i, line in enumerate(lines):
            if test_file:
                tweet_id, tweet = line.split(',')
            else:
                tweet_id, sentiment, tweet = line.split(',')
            feature_vector = get_feature_vector(tweet)
            if test_file:
                tweets.append((tweet_id, feature_vector))
            else:
                tweets.append((tweet_id, int(sentiment), feature_vector))
            utils.write_status(i + 1, total)
    print('\n')
    return tweets
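A hypothetical call, assuming a processed CSV produced by one of the preprocess_csv variants above (the file name is illustrative):

tweets = process_tweets('train-processed.csv', test_file=False)
tweet_id, sentiment, feature_vector = tweets[0]      # one (id, label, features) tuple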
Example #9
def process_CONTENT(csv_file, test_file=True):
    """
    Generates training X, y pairs.
    """
    CONTENT = []
    labels = []
    print('Generating feature vectors')
    with open(csv_file, 'r') as csv:
        lines = csv.readlines()
        total = len(lines)
        for i, line in enumerate(lines):
            if test_file:
                CONTENT_id, CONTENTS = line.split(',')
            else:
                CONTENT_id, sentiment, CONTENTS = line.split(',')
            feature_vector = get_feature_vector(CONTENTS)
            if test_file:
                CONTENT.append(feature_vector)
                labels.append(np.random.randn(1))
            else:
                CONTENT.append(feature_vector)
                labels.append(int(sentiment))
            utils.write_status(i + 1, total)
    print('\n')
    return CONTENT, np.array(labels)
Example #10
def preprocess_csv(
        csv_file_name,
        processed_file_name,
        test_file=True):  # set False when training, True when testing
    save_to_file = open(processed_file_name, 'w')

    with open(csv_file_name, 'r') as csv:
        lines = csv.readlines()
        total = len(lines)
        for i, line in enumerate(lines):
            tweet_id = line[:line.find(',')]
            if not test_file:
                line = line[1 + line.find(','):]
                positive = int(line[:line.find(',')])
            line = line[1 + line.find(','):]
            tweet = line
            processed_tweet = preprocess_tweet(tweet)
            if not test_file:
                save_to_file.write('%s,%d,%s\n' %
                                   (tweet_id, positive, processed_tweet))
            else:
                save_to_file.write('%s,%s\n' % (tweet_id, processed_tweet))
            write_status(i + 1, total)
    save_to_file.close()
    print('\nSaved processed tweets to: %s' % processed_file_name)
    return processed_file_name
Example #11
def main(file_name: str):

    num_tweets, num_pos_tweets, num_neg_tweets = 0, 0, 0
    all_words: List[str] = []
    all_bigrams: List[Tuple[str, str]] = []

    with open(file_name) as csv:

        lines = csv.readlines()
        num_tweets = len(lines)

        for i, line in enumerate(lines):

            t_id, if_pos, tweet = line.strip().split(',')
            if_pos = int(if_pos)

            if if_pos:
                num_pos_tweets += 1
            else:
                num_neg_tweets += 1

            words, bigrams = analyze_tweet(tweet)
            all_words.extend(words)
            all_bigrams.extend(bigrams)
            write_status(i + 1, num_tweets)

    unique_words = list(set(all_words))
    unique_words_file_name = 'unique.txt'

    with open(os.path.join('dataset', unique_words_file_name), 'w') as uwf:
        uwf.write('\n'.join(unique_words))
    sys.stdout.write('\nCalculating frequency distribution')
    sys.stdout.flush()

    # Unigrams
    freq_dist = FreqDist(all_words)
    pkl_file_name = 'freqdist.pkl'

    with open(os.path.join('dataset', pkl_file_name), 'wb') as pkl_file:
        pickle.dump(freq_dist, pkl_file)
    sys.stdout.write(f'\nSaved uni-frequency distribution to {pkl_file_name}')
    sys.stdout.flush()

    # Bigrams
    bigram_freq_dist = get_bigram_freqdist(all_bigrams)
    bi_pkl_file_name = 'freqdist-bi.pkl'

    with open(os.path.join('dataset', bi_pkl_file_name), 'wb') as pkl_file:
        pickle.dump(bigram_freq_dist, pkl_file)

    sys.stdout.write(f'\nSaved bi-frequency distribution to {bi_pkl_file_name}')
    sys.stdout.write('\n[Analysis Statistics]')
    sys.stdout.write(f'\nTweets => Total: {num_tweets}, Positive: {num_pos_tweets}, Negative: {num_neg_tweets}\n')
    sys.stdout.flush()
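The pickles written above can be read back the same way; FreqDist comes from NLTK and behaves like a collections.Counter (the file names match the ones written by main):

import os
import pickle

with open(os.path.join('dataset', 'freqdist.pkl'), 'rb') as pkl_file:
    freq_dist = pickle.load(pkl_file)
print(freq_dist.most_common(10))                     # ten most frequent unigrams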
Example #12
def process_tweets_test(csv_file):
    tweets = []
    print('Generating feature vectors')
    with open(csv_file, 'r') as csv:
        lines = csv.readlines()
        total = len(lines)
        for i, line in enumerate(lines):
            tweet_id, sentiment, tweet = line.split(',')
            feature_vector = get_feature_vector(tweet)
            tweets.append((tweet_id, feature_vector))
            utils.write_status(i + 1, total)
    print('\n')
    return tweets
Example #13
def get_glove_vectors(vocab):
    print('Looking for GLOVE seeds')
    glove_vectors = {}
    found = 0
    with open(GLOVE_FILE, 'r') as glove_file:
        for i, line in enumerate(glove_file):
            utils.write_status(i + 1, 0)
            tokens = line.strip().split()
            word = tokens[0]
            if vocab.get(word):
                vector = [float(e) for e in tokens[1:]]
                glove_vectors[word] = np.array(vector)
                found += 1
    print('\n')
    return glove_vectors
Example #14
def get_glove_vectors(vocab):
    print('Looking for GLOVE vectors')
    glove_vectors = {}
    found = 0
    with open(GLOVE_FILE, 'r', encoding="utf8") as glove_file:
        for i, line in enumerate(glove_file):
            utils.write_status(i + 1, 0)
            tokens = line.split()
            word = tokens[0]
            if vocab.get(word):
                vector = [float(e) for e in tokens[1:]]
                glove_vectors[word] = np.array(vector)
                found += 1
    print('\n')
    print('Found %d words in GLOVE' % found)
    return glove_vectors
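A hypothetical call: GLOVE_FILE is assumed to point at a plain-text file of "word v1 v2 ... vN" lines (e.g. glove.6B.100d.txt), and vocab maps words to indices built elsewhere. Note that "if vocab.get(word):" silently skips a word mapped to index 0; "if word in vocab:" would be the safer membership test:

vocab = {'good': 1, 'bad': 2}        # word -> index; indices assumed to start at 1
vectors = get_glove_vectors(vocab)
print(vectors['good'].shape)         # (100,) for 100-dimensional GloVe vectors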
Example #15
def get_glove_vectors(vocab):
    print('Looking for pre-trained vectors')
    pretrained_vectors = {}
    found = 0
    with open(WORD_VECTORS, 'r', encoding='utf-8') as glove_file:
        for i, line in enumerate(glove_file):
            utils.write_status(i + 1, 0)
            tokens = line.split()
            word = tokens[0]
            if vocab.get(word):
                vector = [float(e) for e in tokens[1:]]
                pretrained_vectors[word] = np.array(vector)
                found += 1
    print('\n')
    print('Found %d words in pre-trained word vectors' % found)
    return pretrained_vectors
Example #16
def get_glove_vectors(vocab):
    """
    Extracts glove vectors from seed file only for words present in vocab.
    """
    print('Looking for GLOVE seeds')
    glove_vectors = {}
    found = 0
    with open(GLOVE_FILE, 'r') as glove_file:
        for i, line in enumerate(glove_file):
            utils.write_status(i + 1, 0)
            tokens = line.strip().split()
            word = tokens[0]
            if vocab.get(word):
                vector = [float(e) for e in tokens[1:]]
                glove_vectors[word] = np.array(vector)
                found += 1
    return glove_vectors
Example #17
def process_tweets(csv_file, test_file=True):
    tweets = []
    print('Generating feature vectors')
    with open(csv_file, 'r') as csv:
        lines = csv.readlines()
        total = len(lines)
        for i, line in enumerate(lines):
            if test_file:
                tweet_id, tweet = line.split(',')
            else:
                tweet_id, sentiment, tweet = line.split(',')
            feature_vector = get_feature_vector(tweet)
            if test_file:
                tweets.append((tweet_id, feature_vector))
            else:
                tweets.append((tweet_id, int(sentiment), feature_vector))
            utils.write_status(i + 1, total)
    print('\n')
    return tweets
Example #18
def process_CONTENT(csv_file, test_file=True):
    CONTENT = []
    print('Generating feature vectors')
    with open(csv_file, 'r') as csv:
        lines = csv.readlines()
        total = len(lines)
        for i, line in enumerate(lines):
            if test_file:
                CONTENT_id, CONTENTS = line.split(',')
            else:
                CONTENT_id, sentiment, CONTENTS = line.split(',')
            feature_vector = get_feature_vector(CONTENTS)
            if test_file:
                CONTENT.append((CONTENT_id, feature_vector))
            else:
                CONTENT.append((CONTENT_id, int(sentiment), feature_vector))
            utils.write_status(i + 1, total)
    print('\n')
    return CONTENT
Example #19
def preprocess_csv(csv_file_name):
    train = "processed_train_data.csv"
    test = "processed_test_data.csv"
    rows = []
    with open(csv_file_name, 'r', encoding="utf8") as csvfile:
        csvreader = csv.reader(csvfile)
        for row in csvreader:
            rows.append(row)
    # Only keep positive and negative tweets
    tweets = [row for row in rows if "neutral" not in row[7]]
    # 20000 train
    # ~10000 test
    train_tweets = tweets[:20000]
    test_tweets = tweets[20000:]
    total = len(tweets)
    success = 0
    save_to_file = open(train, 'w', encoding="utf8")
    for i, tweet in enumerate(train_tweets):
        tweet_content = tweet[1]
        positive = 0 if "positive" not in tweet[7] else 1
        tweet_id = uuid.uuid4()
        processed_tweet = preprocess_tweet(tweet_content)
        save_to_file.write('%s,%d,%s\n' %
                           (tweet_id, positive, processed_tweet))
        success += 1
        write_status(success, total)
    save_to_file.close()

    save_to_file = open(test, 'w', encoding="utf8")
    for i, tweet in enumerate(test_tweets):
        tweet_content = tweet[1]
        positive = 0 if "positive" not in tweet[7] else 1
        tweet_id = uuid.uuid4()
        processed_tweet = preprocess_tweet(tweet_content)
        save_to_file.write('%s,%d,%s\n' %
                           (tweet_id, positive, processed_tweet))
        success += 1
        write_status(success, total)
    save_to_file.close()

    print('\nSaved processed tweets to: %s\n%s' % (test, train))
Example #20
def preprocess_csv(csv_file_name, processed_file_name, test_file):
    save_to_file = open(processed_file_name, 'w')
    df = pd.read_csv(csv_file_name, sep="\t")
    total = len(df.index)
    for i, line in df.iterrows():
        if not test_file:
            tweet = line[1]
            sentiment = int(line[0])
        else:
            tweet = line[0]
        processed_tweet = preprocess_tweet(tweet)
        if not test_file:
            #save_to_file.write('%d,%s\n' %(sentiment, processed_tweet))
            save_to_file.write(str(sentiment) + ',' + processed_tweet + '\n')
        else:
            save_to_file.write('%s\n' % (processed_tweet))
        write_status(i + 1, total)

    save_to_file.close()
    print('\nSaved processed tweets to: %s' % processed_file_name)
    return processed_file_name
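A hypothetical call for this pandas-based variant (it assumes "import pandas as pd" at module top); the input is tab-separated with the label in the first column and the tweet text in the second:

preprocess_csv('train.tsv', 'train-processed.csv', test_file=False)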
Example #21
def process_reviews(csv_file, test_file):
    """Returns a list of tuples of type (review_id, feature_vector)
            or (review_id, sentiment, feature_vector)"""
    reviews = []
    print('Generating feature vectors')
    with open(csv_file, 'r') as csv:
        lines = csv.readlines()
        total = len(lines)
        for i, line in enumerate(lines):
            if test_file:
                review_id, review = line.split(',')
            else:
                review_id, sentiment, review = line.split(',')
            feature_vector = get_feature_vector(review)
            #print(feature_vector)
            if test_file:
                reviews.append((review_id, feature_vector))
            else:
                reviews.append((review_id, int(sentiment), feature_vector))
            utils.write_status(i + 1, total)
    print('\n')
    return reviews
Example #22
def preprocess_csv(csv_file_name, processed_file_name, test_file):

    """ Creates a dictionary with slangs and their equivalents and replaces them """
    with open('slang.txt') as file:
        slang_dict = dict(map(str.strip, line.partition('\t')[::2]) for line in file if line.strip())
    
    #tweet = "happy birthday <user> :) keep handsome , longlife and everything . wish u all the best : > allah bless you . traktir!!!!!! <33333"
    #tweet = "5555555555 gsshg5655 i havent even met <user> yet but i miss them sooo soo much ! ! ! i feel like a got a hole in my heart ! ! would give up everything 4u"
    tweet = "<user> oh ... wells , the kkeut <3  coming from - _ ** beast still makes me scared , i lost my prince anyways , i cant lose others too 4u hhjhhj67 :/"
    new_tweet = preprocess_tweet(tweet, slang_dict)
    print(new_tweet)

    save_to_file = open(processed_file_name, 'w')

    with open(csv_file_name, 'r') as csv:
        lines = csv.readlines()
        total = len(lines)
        for i, line in enumerate(lines):
            tweet_id = line[:line.find(',')]
            if not test_file:
                line = line[1 + line.find(','):]
                positive = int(line[:line.find(',')])
            line = line[1 + line.find(','):-1]
            tweet = line
            processed_tweet = preprocess_tweet(tweet, slang_dict)
            if not test_file:
                save_to_file.write('%s,%d,%s\n' %
                                    (tweet_id, positive, processed_tweet))
            else:
                save_to_file.write('%s,%s\n' %
                                    (tweet_id, processed_tweet))
            write_status(i + 1, total)
    save_to_file.close()
    print ('\nSaved processed tweets to: %s' % processed_file_name)
    return processed_file_name
Example #23
def preprocess_csv(csv_name, processed_name, test_file):
    save_to_file = open(processed_name, 'w')

    with open(csv_name, 'r', encoding="ISO-8859-1") as csv:
        lines = csv.readlines()
        total = len(lines)
        for i, line in enumerate(lines):
            tweet_id = line[:line.find(',')]
            if not test_file:
                line = line[1 + line.find(','):]
                label = int(line[:line.find(',')])
            line = line[1 + line.find(','):]
            tweet = line
            processed_tweet = preprocess_tweet(tweet)
            if not test_file:
                save_to_file.write('%s, %d, %s\n' %
                                   (tweet_id, label, processed_tweet))
            else:
                save_to_file.write('%s, %s\n' % (tweet_id, processed_tweet))
            write_status(i + 1, total)
    save_to_file.close()
    print('\nSaved processed tweets to: %s' % processed_name)
    return processed_name
Example #24
def preprocess_csv(csv_file_name, processed_file_name, test_file=True):
    save_to_file = open(processed_file_name, 'w')

    with open(csv_file_name, 'r') as csv:
        lines = csv.readlines()
        total = len(lines)
        for i, line in enumerate(lines):
            COMMENT_ID = line[:line.find(',')]
            if not test_file:
                line = line[1 + line.find(','):]
                positive = int(line[:line.find(',')])
            line = line[1 + line.find(','):]
            CONTENT = line
            processed_CONTENT = preprocess_CONTENT(CONTENT)
            if not test_file:
                save_to_file.write('%s,%d,%s\n' %
                                   (COMMENT_ID, positive, processed_CONTENT))
            else:
                save_to_file.write('%s,%s\n' % (COMMENT_ID, processed_CONTENT))
            write_status(i + 1, total)
    save_to_file.close()
    print('\nSaved processed comments to: %s' % processed_file_name)
    return processed_file_name
Example #25
def preprocess_df(structured_file_name):
    overall_file_name = sys.argv[1][:-4] + '-overall.csv'
    room_file_name = sys.argv[1][:-4] + '-room.csv'
    cleanliness_file_name = sys.argv[1][:-4] + '-cleanliness.csv'
    service_file_name = sys.argv[1][:-4] + '-service-linear.csv'

    save_to_file = open(service_file_name, 'w')

    with open(structured_file_name, 'r', encoding='utf-8') as csv:
        lines = csv.readlines()
        total = len(lines)
        for i, line in enumerate(lines):
            if i == 0: continue
            line = line.split(',')
            post_id, content, n_posts = line[0], line[10], line[14]
            processed_content = preprocess_tweet(content)
            save_to_file.write(
                '%s,%s,%s\n' %
                (post_id, n_posts.replace('\n', ''), processed_content))

            write_status(i + 1, total)

    save_to_file.close()
    return
Example #26
def preprocess_csv(csv_file_name, processed_file_name, test_file=True):
    save_to_file = open(processed_file_name, 'w')

    with open(csv_file_name, 'r', errors='ignore', encoding='utf8') as csv:
        lines = csv.readlines()
        total = len(lines)
        for i, line in enumerate(lines):
          if i > 1:  
            tweet_id = line[:line.find(',')]
            '''if not test_file:
                line = line[1 + line.find(','):]
                positive = int(line[:line.find(',')])
            '''
            line = line[1 + line.find(','):]
            tweet = line
            processed_tweet = preprocess_tweet(tweet)
            ''' if not test_file:
                save_to_file.write('%s,%d,%s\n' %(tweet_id, positive, processed_tweet))
            else: '''
            save_to_file.write('%s,%s\n' %(tweet_id, processed_tweet))
            write_status(i + 1, total)
    save_to_file.close()
    print('\nSaved processed tweets to: %s' %processed_file_name)
    return processed_file_name
Example #27
            # Fragment: inner batch loop of a train/evaluate script; the model,
            # batch counters, and epoch loop are defined earlier in the source.
            o = model.train_on_batch(training_set_X, training_set_y)
            sys.stdout.write('\rIteration %d/%d, loss:%.4f, acc:%.4f' %
                             (i, n_train_batches, o[0], o[1]))
            sys.stdout.flush()
            i += 1
        val_acc = evaluate_model(model, val_tweets)
        print ('\nEpoch: %d, val_acc:%.4f' % (j + 1, val_acc))
        random.shuffle(train_tweets)
        if val_acc > best_val_acc:
            print ('Accuracy improved from %.4f to %.4f, saving model' % (best_val_acc, val_acc))
            best_val_acc = val_acc
            model.save('best_model.h5')
    print ('Testing')
    del train_tweets
    del model
    model = load_model('best_model.h5')
    test_tweets = process_tweets(TEST_PROCESSED_FILE, test_file=True)
    n_test_batches = int(np.ceil(len(test_tweets) / float(batch_size)))
    predictions = np.array([])
    print ('Predicting batches')
    i = 1
    for test_set_X, _ in extract_features(test_tweets, feat_type=FEAT_TYPE, batch_size=batch_size, test_file=True):
        prediction = np.round(model.predict_on_batch(test_set_X).flatten())
        predictions = np.concatenate((predictions, prediction))
        utils.write_status(i, n_test_batches)
        i += 1
    predictions = [(str(j), int(predictions[j]))
                   for j in range(len(test_tweets))]
    utils.save_results_to_csv(predictions, 'logistic.csv')
    print ('\nSaved to logistic.csv')
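utils.save_results_to_csv is not shown in these examples. A minimal stand-in consistent with the (id, prediction) tuples built above, assuming a Kaggle-style submission layout:

import csv

def save_results_to_csv(results, file_name):
    # Write (id, prediction) rows under a header row.
    with open(file_name, 'w', newline='') as out_file:
        writer = csv.writer(out_file)
        writer.writerow(['id', 'prediction'])
        writer.writerows(results)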
Example #28
     train_tweets, val_tweets = utils.split_data(tweets)
 else:
     random.shuffle(tweets)
     train_tweets = tweets
 del tweets
 print('Extracting features & training batches')
 clf = RandomForestClassifier(n_jobs=2, random_state=0)
 batch_size = len(train_tweets)
 i = 1
 n_train_batches = int(np.ceil(len(train_tweets) / float(batch_size)))
 for training_set_X, training_set_y in extract_features(
         train_tweets,
         test_file=False,
         feat_type=FEAT_TYPE,
         batch_size=batch_size):
     utils.write_status(i, n_train_batches)
     i += 1
     if FEAT_TYPE == 'frequency':
         tfidf = apply_tf_idf(training_set_X)
         training_set_X = tfidf.transform(training_set_X)
     clf.fit(training_set_X, training_set_y)
 print('\n')
 print('Testing')
 if TRAIN:
     correct, total = 0, len(val_tweets)
     i = 1
     batch_size = len(val_tweets)
     n_val_batches = int(np.ceil(len(val_tweets) / float(batch_size)))
     for val_set_X, val_set_y in extract_features(val_tweets,
                                                  test_file=False,
                                                  feat_type=FEAT_TYPE,
Example #29
         num_mentions += result['MENTIONS']
         max_mentions = max(max_mentions, result['MENTIONS'])
         # NOTE: these running totals appear unused; unclear why every tweet is counted.
         num_pos_emojis += result['POS_EMOS']
         num_neg_emojis += result['NEG_EMOS']
         max_emojis = max(max_emojis,
                          result['POS_EMOS'] + result['NEG_EMOS'])
         num_urls += result['URLS']
         max_urls = max(max_urls, result['URLS'])
         num_words += result['WORDS']
         min_words = min(min_words, result['WORDS'])
         max_words = max(max_words, result['WORDS'])
         all_words.extend(words)
         num_bigrams += result['BIGRAMS']
         all_bigrams.extend(bigrams)
         write_status(i + 1, num_tweets)
 num_emojis = num_pos_emojis + num_neg_emojis
 unique_words = list(set(all_words))
 with open(sys.argv[1][:-4] + '-unique.txt', 'w') as uwf:
     uwf.write('\n'.join(unique_words))
 num_unique_words = len(unique_words)
 num_unique_bigrams = len(set(all_bigrams))
 print('\nCalculating frequency distribution')
 # Unigrams
 freq_dist = FreqDist(all_words)
 pkl_file_name = sys.argv[1][:-4] + '-freqdist.pkl'
 with open(pkl_file_name, 'wb') as pf:
     pickle.dump(freq_dist, pf)
 print('Saved uni-frequency distribution to %s' % pkl_file_name)
 # Bigrams
 bigram_freq_dist = get_bigram_freqdist(all_bigrams)
Example #30
         result, words, bigrams = analyze_review(review)
         num_mentions += result['MENTIONS']
         max_mentions = max(max_mentions, result['MENTIONS'])
         num_pos_emojis += result['POS_EMOS']
         num_neg_emojis += result['NEG_EMOS']
         max_emojis = max(max_emojis,
                          result['POS_EMOS'] + result['NEG_EMOS'])
         num_urls += result['URLS']
         max_urls = max(max_urls, result['URLS'])
         num_words += result['WORDS']
         min_words = min(min_words, result['WORDS'])
         max_words = max(max_words, result['WORDS'])
         all_words.extend(words)
         num_bigrams += result['BIGRAMS']
         all_bigrams.extend(bigrams)
         write_status(i + 1, num_reviews)
 num_emojis = num_pos_emojis + num_neg_emojis
 unique_words = list(set(all_words))
 with open(sys.argv[1][:-4] + '-unique.txt', 'w') as uwf:
     uwf.write('\n'.join(unique_words))
 num_unique_words = len(unique_words)
 num_unique_bigrams = len(set(all_bigrams))
 print('\nCalculating frequency distribution')
 # Unigrams
 freq_dist = FreqDist(all_words)
 pkl_file_name = sys.argv[1][:-4] + '-freqdist.pkl'
 with open(pkl_file_name, 'wb') as pkl_file:
     pickle.dump(freq_dist, pkl_file)
 print('Saved uni-frequency distribution to %s' % pkl_file_name)
 # Bigrams
 bigram_freq_dist = get_bigram_freqdist(all_bigrams)
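get_bigram_freqdist is likewise defined elsewhere; a minimal sketch consistent with how its result is pickled above (FreqDist accepts any iterable of hashable items, so (w1, w2) tuples count directly):

from nltk import FreqDist

def get_bigram_freqdist(bigrams):
    # Count occurrences of each (w1, w2) pair.
    return FreqDist(bigrams)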