def get_all_hash(name):
    hashtag = name
    api = tweepy.API(auth)  # 'auth' must be a pre-configured Tweepy OAuth handler
    print("Tweet download has started!")
    maxTweets = 100  # Maximum number of tweets to download
    tweetsPerQry = 100  # This is the max the API permits per request
    fName = 'HDFCTweets.txt'  # We'll store the tweets in a text file.

    # If results from a specific ID onwards are required, set since_id to that ID;
    # otherwise default to no lower limit and go as far back as the API allows.
    sinceId = None

    # If only results below a specific ID are required, set max_id to that ID;
    # otherwise default to no upper limit and start from the most recent tweet matching the search query.
    max_id = -1

    tweets = []
    tweetCount = 0
    print("Downloading max {0} tweets".format(maxTweets))
    file_folder = ownModule.createFileFolder()
    file_folder += os.path.sep
    # Note: 'f' is opened here but never written to; the collected tweets are returned and saved by the caller.
    with open(ownModule.removeFileIfExists(file_folder + fName), 'w') as f:
        while tweetCount < maxTweets:
            try:
                if (max_id <= 0):
                    if (not sinceId):
                        new_tweets = api.search(q=hashtag, count=tweetsPerQry)
                    else:
                        new_tweets = api.search(q=hashtag, count=tweetsPerQry, since_id=sinceId)
                else:
                    if (not sinceId):
                        new_tweets = api.search(q=hashtag, count=tweetsPerQry, max_id=str(max_id - 1))
                    else:
                        new_tweets = api.search(q=hashtag, count=tweetsPerQry, max_id=str(max_id - 1), since_id=sinceId)
                if not new_tweets:
                    print("No more tweets found")
                    break
                for tweet in new_tweets:
                    tweets.append(tweet.text)
                tweetCount += len(new_tweets)
                print("Downloaded {0} tweets".format(tweetCount))
                max_id = new_tweets[-1].id
            except tweepy.TweepError as e:
                # Just exit on any API error (e.g. rate limiting)
                print("Error while downloading tweets: " + str(e))
                break

    return tweets
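# Usage sketch (an assumption, not part of the original code): before calling
# get_all_hash the module needs a configured Tweepy OAuth handler bound to the
# name 'auth', e.g. with placeholder credentials:
#
#     auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
#     auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
#     tweets = get_all_hash("#HDFC")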
# Example #2
def get_classifier_NB(training_data):
    # Load a cached Naive Bayes classifier if one exists; otherwise train and cache it.
    file_folder = ownModule.createFileFolder('classifiers')
    file_folder += os.path.sep
    fname = file_folder + "NB_classify"
    if os.path.isfile(fname):
        with open(fname, "rb") as file_fp:
            classifier = pickle.load(file_fp)
        return classifier
    else:
        classifier = nltk.NaiveBayesClassifier.train(training_data)
        with open(fname, "wb") as file_fp:
            pickle.dump(classifier, file_fp)
        return classifier
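# For reference, nltk.NaiveBayesClassifier.train expects a list of
# (feature_dict, label) pairs; a minimal illustrative call (data made up,
# not from the original project) would be:
#
#     training_data = [({'great': True, 'service': True}, 'pos'),
#                      ({'slow': True, 'support': True}, 'neg')]
#     classifier = get_classifier_NB(training_data)
#     classifier.classify({'great': True})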
# Example #3
def get_classifier_BNB(training_data):
    # Load a cached Bernoulli Naive Bayes classifier if one exists; otherwise train and cache it.
    file_folder = ownModule.createFileFolder('classifiers')
    file_folder += os.path.sep
    fname = file_folder + "BNB_classify"
    if os.path.isfile(fname):
        with open(fname, "rb") as file_fp:
            classifier = pickle.load(file_fp)
        return classifier
    else:
        classifier = SklearnClassifier(BernoulliNB())
        classifier.train(training_data)
        with open(fname, "wb") as file_fp:
            pickle.dump(classifier, file_fp)
        return classifier
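# get_classifier_BNB takes the same (feature_dict, label) pairs: SklearnClassifier
# wraps the scikit-learn BernoulliNB estimator behind NLTK's classifier interface,
# so the cached object is used exactly like the Naive Bayes one above. Assumed
# imports for this fragment (not shown in the extracted snippet):
#
#     from nltk.classify.scikitlearn import SklearnClassifier
#     from sklearn.naive_bayes import BernoulliNB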
def main():
    hash_tag = "#HDFC"
    tweets = get_all_hash(hash_tag)
    file_folder = ownModule.createFileFolder()
    file_folder += os.path.sep
    save_file = open(ownModule.removeFileIfExists(file_folder + "HDFCTweets" + '.txt'), 'a')
    post_no = 1
    for tweet in tweets:
        print(tweet)
        save_file.write("POST No: " + str(post_no) + " ")
        save_file.write(tweet.encode('ascii', 'ignore').decode())
        save_file.write("\n")
        post_no += 1
    save_file.close()
# Example #5
def get_GNB(training_data):
    file_folder = ownModule.createFileFolder('classifiers')
    file_folder += os.path.sep
    fname = file_folder + "GNB_classify"
    # GaussianNB needs plain numeric feature vectors and integer labels,
    # so flatten the (feature_dict, label) training pairs first.
    Y = [int(i[-1]) for i in training_data]
    training_data = [list(i) for i in training_data]
    lim = len(training_data)
    for i in range(0, lim):
        # Relies on every feature dict listing its keys in the same order
        training_data[i] = list(training_data[i][0].values())
    X = training_data
    if os.path.isfile(fname):
        with open(fname, "rb") as file_fp:
            classifier = pickle.load(file_fp)
        return classifier
    else:
        cf = GaussianNB()
        cf.fit(X, Y)
        with open(fname, "wb") as file_fp:
            pickle.dump(cf, file_fp)
        return cf
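# get_GNB expects the same (feature_dict, label) pairs but with numeric feature
# values and integer labels, since the dict values are flattened into plain
# vectors for GaussianNB; keys must appear in the same order in every dict.
# Illustrative call (made-up data):
#
#     training_data = [({'f1': 0.3, 'f2': 1.0}, 1),
#                      ({'f1': 0.9, 'f2': 0.0}, 0)]
#     gnb = get_GNB(training_data)
#     gnb.predict([[0.5, 1.0]])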
def main():
    APP_ID = '416379438547915'
    APP_SECRET = 'e1c66cb8409855dcf19bf01cb8bdcd6a'
    bank_name = 'HDFC.bank'
    graph_url = "https://graph.facebook.com/" + bank_name
    json_fb_posts = get_all_posts(graph_url, APP_ID, APP_SECRET)

    post_no = 1
    starting_download_time = datetime.datetime.now()
    # Pass the name of the folder to create (relative to the project directory) to createFileFolder
    file_folder = ownModule.createFileFolder('outputFiles')
    file_folder += os.path.sep
    with open(ownModule.removeFileIfExists(file_folder + bank_name + "FaceBookPosts.csv"), 'w', newline='') as csvfile:
        spam_writer = csv.writer(csvfile, delimiter='|',
                                 quotechar='|', quoting=csv.QUOTE_MINIMAL)
        spam_writer.writerow(["Post", "Time"])
        for post in json_fb_posts:
            if 'message' in post.keys():
                print(post['message'])
                post['message'] = post['message'].replace("\n", ' ')
                spam_writer.writerow([post['message'], post['created_time']])
                if post_no == 1:
                    starting_download_time = post['created_time']
                post_no += 1
# Example #7
__author__ = 'vikas'
import csv
import math
import os

import extract_features as ef
import nltk
import ownModule
from nltk.corpus import stopwords as stw
from nltk.stem.porter import *
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.semi_supervised import LabelPropagation

# Create the Porter stemmer
stemmer = PorterStemmer()
file_folder = ownModule.createFileFolder('outputFiles')
file_folder += os.path.sep
fp = open(file_folder + "First1000.csv", "r", encoding="utf-8")
input_reader = csv.reader(fp, delimiter="|")
input_reader = [row[0] for row in input_reader]
feature_set = []
for i in input_reader:
    m = sent_tokenize(i)
    word_list = []
    for k in m:
        words = nltk.tokenize.word_tokenize(k)
        for word in words:
            word_list.append(stemmer.stem(word.lower()))
    feature_set.append(word_list)

ans_reader = open(file_folder + "labels_of_data.csv", "r")
ans_csv = csv.reader(ans_reader, delimiter="|")
ans_reader = [row for row in ans_csv]
def main():
    stemmer = PorterStemmer()
    file_folder = ownModule.createFileFolder('outputFiles')
    file_folder += os.path.sep
    fp = open(file_folder + "First1000.csv", "r", encoding="utf-8")
    input_reader = csv.reader(fp, delimiter="|")
    input_reader = [row[0] for row in input_reader]

    feature_set = []
    all_words = {}

    stopwords = stw.words('english')
    # Count the occurrence of each word in all of the labelled documents
    for i in input_reader:
        m = sent_tokenize(i)
        for k in m:
            words = nltk.tokenize.word_tokenize(k)
            for word in words:
                if word not in stopwords:
                    stem = stemmer.stem(word.lower())
                    all_words[stem] = all_words.get(stem, 0) + 1

    file_handle = open(file_folder + "Unlabelled Posts.txt",
                       "r",
                       encoding="utf-8")
    file_handle = file_handle.readlines()[0:1000]
    # Count the occurrence of each word in all of the unlabelled documents

    for i in file_handle:
        temp = i[1:-1]
        m = sent_tokenize(temp)
        for k in m:
            words = nltk.tokenize.word_tokenize(k)
            for word in words:
                if word not in stopwords:
                    stem = stemmer.stem(word.lower())
                    all_words[stem] = all_words.get(stem, 0) + 1
    # total number of documents
    total_docs = len(input_reader) + len(file_handle)
    # total number of words
    total_words = len(all_words)
    feature_set = []
    for i in input_reader:
        m = sent_tokenize(i)
        # Count stemmed, non-stopword terms for this document
        word_list = {}
        for k in m:
            words = nltk.tokenize.word_tokenize(k)
            for word in words:
                if word not in stopwords:
                    stem = stemmer.stem(word.lower())
                    word_list[stem] = word_list.get(stem, 0) + 1
        # Now weight every word seen across all of the documents:
        # in-document count times log(total documents / overall count of the word)
        temp_features = []
        for key in all_words:
            temp_features.append(word_list.get(key, 0) *
                                 math.log(total_docs / all_words[key]))
        feature_set.append(temp_features)

    # Now do it for unlabelled data

    for i in file_handle:
        temp = i[1:-1]
        m = sent_tokenize(temp)
        # Count stemmed, non-stopword terms for this document
        word_list = {}
        for k in m:
            words = nltk.tokenize.word_tokenize(k)
            for word in words:
                if word not in stopwords:
                    stem = stemmer.stem(word.lower())
                    word_list[stem] = word_list.get(stem, 0) + 1
        # Same weighting as for the labelled documents
        temp_features = []
        for key in all_words:
            temp_features.append(word_list.get(key, 0) *
                                 math.log(total_docs / all_words[key]))
        feature_set.append(temp_features)

    file_handle = [i[1:-1] for i in file_handle]
    newlist = input_reader + file_handle
    feature_extraction = CountVectorizer(stop_words='english')
    x = feature_extraction.fit_transform(newlist).toarray()
    label_prop_model = LabelPropagation(kernel='knn')
    # read the labels
    ans_reader = open(file_folder + "labels_of_data_department.csv", "r")
    ans_csv = csv.reader(ans_reader, delimiter="|")
    ans_reader = [int(row[0]) for row in ans_csv]
    # -1 marks a sample as unlabelled for LabelPropagation
    for i in file_handle:
        ans_reader.append(-1)
    print(len(ans_reader))
    t = label_prop_model.fit(x, ans_reader)

    accuracy = ownModule.Vevaluate()
    print("Accuracy of Semi Supervised Learning:", accuracy)
    return accuracy