# Requires a module-level tweepy `auth` handler and the ownModule helpers.
import tweepy
import ownModule


def get_all_hash(name):
    hashtag = name
    api = tweepy.API(auth)
    print("Tweet download has started!")
    maxTweets = 100      # upper bound on tweets to fetch
    tweetsPerQry = 100   # this is the max the API permits
    fName = 'HDFCTweets.txt'  # we'll store the tweets in a text file

    # If results from a specific ID onwards are required, set since_id to that ID;
    # else default to no lower limit and go as far back as the API allows.
    sinceId = None

    # If only results below a specific ID are required, set max_id to that ID;
    # else default to no upper limit and start from the most recent matching tweet.
    max_id = -1

    tweets = []
    tweetCount = 0
    print("Downloading max {0} tweets".format(maxTweets))
    file_folder = ownModule.createFileFolder()
    if '\\' in file_folder:
        file_folder += '\\'
    else:
        file_folder += '/'
    # The output file is (re)created here; the tweets themselves are
    # returned and written out by the caller.
    with open(ownModule.removeFileIfExists(file_folder + fName), 'w') as f:
        while tweetCount < maxTweets:
            try:
                if max_id <= 0:
                    if not sinceId:
                        new_tweets = api.search(q=hashtag, count=tweetsPerQry)
                    else:
                        new_tweets = api.search(q=hashtag, count=tweetsPerQry,
                                                since_id=sinceId)
                else:
                    if not sinceId:
                        new_tweets = api.search(q=hashtag, count=tweetsPerQry,
                                                max_id=str(max_id - 1))
                    else:
                        new_tweets = api.search(q=hashtag, count=tweetsPerQry,
                                                max_id=str(max_id - 1),
                                                since_id=sinceId)
                if not new_tweets:
                    print("No more tweets found")
                    break
                for tweet in new_tweets:
                    tweets.append(tweet.text)
                tweetCount += len(new_tweets)
                print("Downloaded {0} tweets".format(tweetCount))
                max_id = new_tweets[-1].id
            except tweepy.TweepError as e:
                # Just exit on any error
                print("some error : " + str(e))
                break
    return tweets
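# The ownModule helpers used throughout (createFileFolder, removeFileIfExists,
# Vevaluate) are not shown in this repo. The sketch below is an assumption
# about what the first two do, reconstructed from how the callers use them;
# it is not the original implementation.
import os


def createFileFolder(name='outputFiles'):
    # Create `name` relative to the project directory if missing and
    # return its path (callers append their own separator).
    path = os.path.join(os.getcwd(), name)
    os.makedirs(path, exist_ok=True)
    return path


def removeFileIfExists(path):
    # Delete a stale output file so the caller starts fresh, then hand
    # the path back for reopening.
    if os.path.isfile(path):
        os.remove(path)
    return path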
# Imports assumed by the classifier helpers below (NB, BNB, GNB).
import os
import pickle

import nltk
import ownModule
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import BernoulliNB, GaussianNB


def get_classifier_NB(training_data):
    # Return a cached NLTK Naive Bayes classifier if one was pickled
    # earlier; otherwise train one and cache it.
    file_folder = ownModule.createFileFolder('classifiers')
    file_folder += os.path.sep
    fname = file_folder + "NB_classify"
    if os.path.isfile(fname):
        with open(fname, "rb") as file_fp:
            classifier = pickle.load(file_fp)
        return classifier
    else:
        classifier = nltk.NaiveBayesClassifier.train(training_data)
        with open(fname, "wb") as file_fp:
            pickle.dump(classifier, file_fp)
        return classifier
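# get_classifier_NB expects NLTK-style training data: a list of
# (feature_dict, label) pairs. Minimal usage sketch; the feature names and
# labels below are invented for illustration only.
example_training_data = [
    ({'contains(refund)': True, 'contains(thanks)': False}, 'complaint'),
    ({'contains(refund)': False, 'contains(thanks)': True}, 'praise'),
]
nb = get_classifier_NB(example_training_data)
print(nb.classify({'contains(refund)': True, 'contains(thanks)': False}))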
def get_classifier_BNB(training_data):
    # Same caching scheme as get_classifier_NB, but with a Bernoulli
    # Naive Bayes model wrapped in NLTK's scikit-learn adapter.
    file_folder = ownModule.createFileFolder('classifiers')
    file_folder += os.path.sep
    fname = file_folder + "BNB_classify"
    if os.path.isfile(fname):
        with open(fname, "rb") as file_fp:
            classifier = pickle.load(file_fp)
        return classifier
    else:
        classifier = SklearnClassifier(BernoulliNB())
        classifier.train(training_data)
        with open(fname, "wb") as file_fp:
            pickle.dump(classifier, file_fp)
        return classifier
def main():
    hash_tag = "#HDFC"
    tweets = get_all_hash(hash_tag)
    file_folder = ownModule.createFileFolder()
    if '\\' in file_folder:
        file_folder += '\\'
    else:
        file_folder += '/'
    save_file = open(ownModule.removeFileIfExists(file_folder + "HDFCTweets" + '.txt'), 'a')
    post_no = 1
    for tweet in tweets:
        print(tweet)
        save_file.write("POST No: " + str(post_no) + " ")
        # Drop non-ASCII characters so the text file stays plain ASCII.
        save_file.write(tweet.encode('ascii', 'ignore').decode())
        save_file.write("\n")
        post_no += 1
    save_file.close()
def get_GNB(training_data):
    file_folder = ownModule.createFileFolder('classifiers')
    file_folder += os.path.sep
    fname = file_folder + "GNB_classify"
    # We need the training data as NumPy-style arrays (scikit-learn's
    # GaussianNB takes X and Y directly), so the following converts the
    # (feature_dict, label) pairs.
    Y = [int(i[-1]) for i in training_data]
    training_data = [list(i) for i in training_data]
    lim = len(training_data)
    for i in range(0, lim):
        training_data[i] = list(training_data[i][0].values())
    X = training_data
    if os.path.isfile(fname):
        with open(fname, "rb") as file_fp:
            classifier = pickle.load(file_fp)
        return classifier
    else:
        cf = GaussianNB()
        cf.fit(X, Y)
        with open(fname, "wb") as file_fp:
            pickle.dump(cf, file_fp)
        return cf
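# Caveat: the conversion in get_GNB takes dict.values() in insertion order,
# so prediction rows must list feature values in the same key order used at
# training time. Hypothetical usage (feature names invented); note that
# int(i[-1]) above requires numeric labels.
example_training_data = [
    ({'word_count': 12, 'has_link': 0}, '0'),
    ({'word_count': 3, 'has_link': 1}, '1'),
]
gnb = get_GNB(example_training_data)
print(gnb.predict([[7, 1]]))  # values ordered as (word_count, has_link)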
# Imports required by this script; get_all_posts is defined elsewhere.
import csv
import datetime
import os

import ownModule


def main():
    APP_ID = '416379438547915'
    APP_SECRET = 'e1c66cb8409855dcf19bf01cb8bdcd6a'
    bank_name = 'HDFC.bank'
    graph_url = "https://graph.facebook.com/" + bank_name
    json_fb_posts = get_all_posts(graph_url, APP_ID, APP_SECRET)
    post_no = 1
    starting_download_time = datetime.datetime.now()
    # Pass the name of the folder you wish to create, relative to the
    # project directory, to the function below.
    file_folder = ownModule.createFileFolder('outputFiles')
    file_folder += os.path.sep
    with open(ownModule.removeFileIfExists(file_folder + bank_name + "FaceBookPosts.csv"),
              'w', newline='') as csvfile:
        spam_writer = csv.writer(csvfile, delimiter='|', quotechar='|',
                                 quoting=csv.QUOTE_MINIMAL)
        spam_writer.writerow(["Post", "Time"])
        for post in json_fb_posts:
            if 'message' in post.keys():
                print(post['message'])
                post['message'] = post['message'].replace("\n", ' ')
                spam_writer.writerow([post['message'], post['created_time']])
                if post_no == 1:
                    starting_download_time = post['created_time']
                post_no += 1
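# get_all_posts is not shown in this repo. The sketch below is one plausible
# reconstruction, assuming an app access token and the Graph API /posts edge
# with cursor paging; names and fields are assumptions, not the original code.
import requests


def get_all_posts(graph_url, app_id, app_secret):
    token = app_id + '|' + app_secret  # app access token form
    url = graph_url + '/posts'
    params = {'access_token': token, 'fields': 'message,created_time'}
    posts = []
    while url:
        payload = requests.get(url, params=params).json()
        posts.extend(payload.get('data', []))
        # `next` is an absolute URL with the query string already included.
        url = payload.get('paging', {}).get('next')
        params = None
    return posts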
__author__ = 'vikas'

import csv
import os

import extract_features as ef
import nltk
import ownModule
from nltk.stem.porter import *
from nltk.tokenize import sent_tokenize

# Get the document stemmer
stemmer = PorterStemmer()

file_folder = ownModule.createFileFolder('outputFiles')
file_folder += os.path.sep

fp = open(file_folder + "First1000.csv", "r", encoding="utf-8")
input_reader = csv.reader(fp, delimiter="|")
input_reader = [row[0] for row in input_reader]

# Build one list of stemmed, lower-cased tokens per document.
feature_set = []
for i in input_reader:
    m = sent_tokenize(i)
    word_list = []
    for k in m:
        words = nltk.tokenize.word_tokenize(k)
        for word in words:
            word_list.append(stemmer.stem(word.lower()))
    feature_set.append(word_list)

ans_reader = open(file_folder + "labels_of_data.csv", "r")
ans_csv = csv.reader(ans_reader, delimiter="|")
ans_reader = [row for row in ans_csv]
# Additional imports required by this routine.
import csv
import math
import os

import nltk
import ownModule
from nltk.corpus import stopwords as stw
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.semi_supervised import LabelPropagation


def main():
    stemmer = PorterStemmer()
    file_folder = ownModule.createFileFolder('outputFiles')
    file_folder += os.path.sep
    fp = open(file_folder + "First1000.csv", "r", encoding="utf-8")
    input_reader = csv.reader(fp, delimiter="|")
    input_reader = [row[0] for row in input_reader]
    all_words = {}
    stopwords = stw.words('english')

    # Count the occurrence of each word across all of the labelled documents
    for i in input_reader:
        m = sent_tokenize(i)
        for k in m:
            words = nltk.tokenize.word_tokenize(k)
            for word in words:
                if word not in stopwords:
                    try:
                        all_words[stemmer.stem(word.lower())] += 1
                    except KeyError:
                        all_words[stemmer.stem(word.lower())] = 1

    file_handle = open(file_folder + "Unlabelled Posts.txt", "r", encoding="utf-8")
    file_handle = file_handle.readlines()[0:1000]

    # Count the occurrence of each word across all of the unlabelled documents
    for i in file_handle:
        temp = i[1:]
        temp = temp[:-1]
        m = sent_tokenize(temp)
        for k in m:
            words = nltk.tokenize.word_tokenize(k)
            for word in words:
                if word not in stopwords:
                    try:
                        all_words[stemmer.stem(word.lower())] += 1
                    except KeyError:
                        all_words[stemmer.stem(word.lower())] = 1

    # Total number of documents and of distinct stems
    total_docs = len(input_reader) + len(file_handle)
    total_words = len(all_words)

    # Build one TF-IDF-style vector per labelled document. (These vectors
    # are computed but not used below; the classifier is fed CountVectorizer
    # features instead.)
    feature_set = []
    for i in input_reader:
        m = sent_tokenize(i)
        # word_list accumulates counts across all sentences of the document
        word_list = {}
        for k in m:
            words = nltk.tokenize.word_tokenize(k)
            for word in words:
                if word not in stopwords:
                    try:
                        word_list[stemmer.stem(word.lower())] += 1
                    except KeyError:
                        word_list[stemmer.stem(word.lower())] = 1
        # Now score every word seen across all of the documents
        temp_features = []
        for key, val in all_words.items():
            try:
                temp_features.append(word_list[key] * math.log(total_docs / all_words[key]))
            except KeyError:
                temp_features.append(0)
        feature_set.append(temp_features)

    # Now do the same for the unlabelled data
    for i in file_handle:
        temp = i[1:]
        temp = temp[:-1]
        m = sent_tokenize(temp)
        word_list = {}
        for k in m:
            words = nltk.tokenize.word_tokenize(k)
            for word in words:
                if word not in stopwords:
                    try:
                        word_list[stemmer.stem(word.lower())] += 1
                    except KeyError:
                        word_list[stemmer.stem(word.lower())] = 1
        temp_features = []
        for key, val in all_words.items():
            try:
                temp_features.append(word_list[key] * math.log(total_docs / all_words[key]))
            except KeyError:
                temp_features.append(0)
        feature_set.append(temp_features)

    file_handle = [i[1:-1] for i in file_handle]
    newlist = input_reader + file_handle
    feature_extraction = CountVectorizer(stop_words='english')
    x = feature_extraction.fit_transform(newlist).toarray()
    label_prop_model = LabelPropagation(kernel='knn')

    # Read the labels; unlabelled documents get the sentinel label -1,
    # which LabelPropagation treats as "unknown".
    ans_reader = open(file_folder + "labels_of_data_department.csv", "r")
    ans_csv = csv.reader(ans_reader, delimiter="|")
    ans_reader = [int(row[0]) for row in ans_csv]
    for i in file_handle:
        ans_reader.append(-1)
    print(len(ans_reader))

    label_prop_model.fit(x, ans_reader)
    accuracy = ownModule.Vevaluate()
    print("Accuracy of Semi Supervised Learning:", accuracy)
    return accuracy
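# For reference: the -1 sentinel used above is scikit-learn's standard
# interface for semi-supervised learners. The toy example below is a
# self-contained sketch (made-up data, not from this project) showing the
# same LabelPropagation pattern in isolation.
from sklearn.semi_supervised import LabelPropagation

X = [[0.0, 1.0], [0.1, 0.9], [1.0, 0.0], [0.9, 0.1]]
y = [0, -1, 1, -1]  # two known labels, two unknown

model = LabelPropagation(kernel='knn', n_neighbors=2)
model.fit(X, y)
# Inferred labels for every row, including the previously unlabelled ones.
print(model.transduction_)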