def get_news_tags():
    text_data = []
    r = read_hdf5(files["news_store"], "reddit")
    t = read_hdf5(files["news_store"], "twitter_conversations")
    n = read_hdf5(files["news_store"], "news_articles")
    text_data.extend(r.text.values)
    text_data.extend(r.title.values)
    text_data.extend(t.text.values)
    text_data.extend(n.text.values)
    text_data.extend(n.title.values)

    st = time.process_time()
    all_ = {}
    for text in text_data:
        tags = tag_str(text)
        for tag in tags:
            if tag in all_:
                all_[tag] += 1
            else:
                all_[tag] = 1
    pprint(all_)
    print(f"Took {time.process_time() - st} sec to complete")

    df_tags = pd.DataFrame.from_dict(data=all_, orient="index", columns=["mentions"])
    df_tags.sort_values(by="mentions", ascending=False, inplace=True)
    print(df_tags)
    df_tags.to_csv("facts/tags_gotten2.csv")
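# Hedged usage sketch (not part of the original module): shows how the CSV
# written by get_news_tags() could be loaded back and the most-mentioned tags
# inspected. The path "facts/tags_gotten2.csv" is taken from the function
# above; the helper name and top_n parameter are illustrative only.
def show_top_tags(path="facts/tags_gotten2.csv", top_n=20):
    # The CSV stores the tag as the index column and a "mentions" count column.
    df = pd.read_csv(path, index_col=0)
    print(df.sort_values(by="mentions", ascending=False).head(top_n))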
def check_words():
    #twitter = read_hdf5(files["news_store"], "twitter_conversations")
    reddit = read_hdf5(files["news_store"], "reddit")
    words = []
    # Build the reference English vocabulary once; rebuilding it per row is slow.
    corp = set(corpus.words.words() + corpus.brown.words())
    english_vocab = set(w.lower() for w in corp)
    for index, row in reddit.iterrows():
        score = row["score"]
        title = row["title"]
        text = row["text"]
        all_text = title + " " + text
        urls = re.findall(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+', all_text)
        for char in special_chars_str:
            all_text = all_text.replace(char, "")
        all_text_tokenized = tokenize.word_tokenize(all_text)
        sample_lower = set(w.lower() for w in all_text_tokenized if w.lower().isalpha())
        # Tokens not found in the reference corpora are treated as "unusual".
        unusual = sample_lower.difference(english_vocab)
        words += unusual
        votes, percent = score.split(":")
        print(votes, all_text, "______", urls)
        print("______________________________")
    print(words)
    counted = collections.Counter(words)
    print(counted)
    with open(files["reddit_comments"], "br") as f:
        d = pkl.load(f)
    pprint(d)
    articles = read_hdf5(files["news_store"], "news_articles")
    print(reddit, articles)
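# Hedged, self-contained illustration of the "unusual word" check used in
# check_words() above: tokenize a sample string and keep the alphabetic tokens
# that do not occur in the NLTK words/brown corpora. Assumes the "words",
# "brown" and "punkt" NLTK data packages have been downloaded
# (nltk.download("words"), etc.) and reuses this module's corpus/tokenize
# imports; the sample sentence is made up.
def unusual_words_example():
    sample = "Tesla and AMD earnings hype is peaking on r/wallstreetbets"
    vocab = set(w.lower() for w in corpus.words.words() + corpus.brown.words())
    tokens = set(w.lower() for w in tokenize.word_tokenize(sample) if w.isalpha())
    print(tokens.difference(vocab))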
def __init__(self): #del_hdf5_key(files["news_store"], "reddit") try: if len(read_hdf5(files["news_store"], "reddit")) <= 1: error("Too short new") self.df_subs = pd.DataFrame(columns=[ "id", "text", "title", "created", "score", "comments", "comment_num" ]) else: self.old_save = True self.df_subs = read_hdf5(files["news_store"], "reddit") except KeyError as e: error(e) self.df_subs = pd.DataFrame(columns=[ "id", "text", "title", "created", "score", "comments", "comment_num" ]) try: self.ids = list(read_hdf5(files["news_store"], "reddit").id.values) except KeyError as e: error(e) self.ids = list() user_agent = "Windows: News Analyser :v1.0 (by /u/ludvig127)" self.red = praw.Reddit( user_agent=user_agent, client_id=read_yaml(files["auth"])["reddit_client_id"], client_secret=read_yaml(files["auth"])["reddit_client_secret"])
def __init__(self): user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " \ "(KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36 OPR/66.0.3515.44" self.config = Config() self.config.browser_user_agent = user_agent self.config.memoize_articles = True self.config.verbose = True self.config.language = "en" ignore_already_gotten = True #nono_host_words = ["arabic", "espanol", "latino", "latina"] self.df_art_base = pd.DataFrame( columns=["link", "text", "title", "created", "keywords", "author"]) try: self.newssites = ([ "https://www.cnbc.com", 'http://cnn.com', "http://www.huffingtonpost.com", "http://www.nytimes.com", "http://news.bbc.co.uk/", "http://www.theguardian.com/" ]) except Exception as e: error(str(e)) self.urls = {} try: if len(read_hdf5(files["news_store"], "news_articles")) <= 1: error("Too short new") del_hdf5_key(["news_store"], "news_articles") self.df_art = pd.DataFrame(columns=[ "link", "text", "title", "created", "keywords", "author" ]) else: self.df_art = read_hdf5(files["news_store"], "news_articles") except KeyError as e: error(e) self.df_art = pd.DataFrame(columns=[ "link", "text", "title", "created", "keywords", "author" ]) except TypeError: error("Too short new") del_hdf5_key(files["news_store"], "news_articles") self.df_art = pd.DataFrame(columns=[ "link", "text", "title", "created", "keywords", "author" ]) try: self.urls["gotten"] = read_hdf5(files["news_store"], "news_articles").link.values except (KeyError, AttributeError) as e: error(e) self.urls["gotten"] = []
def __init__(self):
    twitter_consumer_key = read_yaml(files["auth"])["twitter_consumer_key"]
    twitter_secret_consumer_key = read_yaml(
        files["auth"])["twitter_secret_consumer_key"]
    twitter_access_token_key = read_yaml(
        files["auth"])["twitter_access_token_key"]
    twitter_secret_access_token_key = read_yaml(
        files["auth"])["twitter_secret_access_token_key"]
    auth = tweepy.OAuthHandler(twitter_consumer_key, twitter_secret_consumer_key)
    auth.set_access_token(twitter_access_token_key, twitter_secret_access_token_key)
    self.ids = []
    self.store_dir = files["news_store"]
    self.api = tweepy.API(auth)
    try:
        self.api.verify_credentials()
        log("Authentication OK")
    except Exception:
        error("Error during authentication")
        log("Error during authentication")
    try:
        data = read_hdf5(files["news_store"], key="twitter_conversations")
        ids = [int(i) for i in data.id.values]
        since_id = max(ids)
    except Exception as e:
        since_id = 1252311068950638593
        error(e)
    print(since_id)
    self.get_conversations(since_id=since_id)
def tags_history():
    """Create a dataframe of tags with their dates, formatted with the tag as
    index and a dict mapping date (hour) to count."""
    dold = read_hdf5(files["news_store"], "news_articles")
    d = read_hdf5(files["news_store"], "news_articles").set_index(keys="created")
    news_created = d.drop("None").dropna().sort_values(by="created")
    dold["created"] = dold["created"].apply(date_to_posix)
    print(dold.dropna().set_index(keys="created").sort_values(by="created"))
    sleep(10000)
    dates = news_created.index.values
    print(dates, type(dates))
    stamps = [
        time.mktime(ciso8601.parse_datetime(t).timetuple())
        for t in dates if isinstance(t, str)
    ]
    print(stamps)
    for date, stamp in zip(dates, stamps):
        print(f"{date} ; {stamp}")
    sleep(10000)
    twe = read_hdf5(files["news_store"], "twitter_conversations")
    red = read_hdf5(files["news_store"], "reddit").set_index(keys="created")
    print("articles", d)
    print("twitter", twe)
    print("reddit", red)
    texts = 0
    all_tags = {}
    text_list = []
    text_list.extend(d.text.values)
    text_list.extend(twe.text.values)
    text_list.extend(red.selftext.values)
    for p in text_list:
        texts += 1
        print(p)
        tags = find_tags_from_str(p, as_str=False)
        print("TAGS: ", tags)
        for tag in tags:
            if tag in all_tags:
                all_tags[tag] += 1
            else:
                all_tags[tag] = 1
    # from finnhub import Stream
    # Stream()
    pprint(all_tags)
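# Hedged sketch of the hourly aggregation the docstring above describes: given
# a dataframe with a POSIX "created" column and a "tag" column, count mentions
# per tag per hour. The column names are assumptions and this is not the
# project's real pipeline (which builds its counts in tags_history itself);
# it only assumes pandas is available as pd in this module.
def hourly_tag_counts_sketch(df):
    df = df.copy()
    # Convert POSIX seconds to timestamps and truncate to the hour.
    df["hour"] = pd.to_datetime(df["created"], unit="s").dt.floor("H")
    return df.groupby(["tag", "hour"]).size().unstack(fill_value=0)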
def make_all_timestamp():
    dold = read_hdf5(files["news_store"], "twitter_conversations").reset_index()
    #dold["created_date"] = dold["created_date"].apply(date_to_posix)
    dold = dold.dropna().sort_values(by="created_date")
    print(dold)
    print(dold.loc[dold.index.values[0]])
    update_hdf5(files["news_store"],
                "twitter_conversations",
                dataframe=dold.reset_index(),
                append=False)
def to_backup(file, key, append=True):
    backup_df = read_hdf5(file, key)
    print(backup_df)
    update_hdf5(files["backup"], key, append=append, dataframe=backup_df)
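# Hedged sketch of what the read_hdf5/update_hdf5 helpers imported from
# __init__ might look like, since they are used throughout this module. This is
# an assumption based on pandas' HDF5 API, not the project's actual
# implementation. Example call for the function above (also illustrative):
# to_backup(files["news_store"], "reddit")
def read_hdf5_sketch(path, key):
    return pd.read_hdf(path, key=key)

def update_hdf5_sketch(path, key, dataframe, append=True):
    # format="table" is required for appending rows to an existing key.
    dataframe.to_hdf(path, key=key, mode="a", format="table", append=append)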
def dicts_to_df(dicts):
    columns_list = ["tags_combined", "time_start", "time_stop"]
    for t in return_tags():
        columns_list.extend(t)
    df = pd.DataFrame(columns=columns_list)
    for d in dicts:
        print(d)
        df = df.append(d, ignore_index=True).fillna(0)
    print(df)
    return df


if __name__ == "__main__":
    sleep(1000)
    get_news_tags()
    tag = read_hdf5(files["tags_df"], "twitter").AMD.values
    comb = read_hdf5(files["tags_df"], "twitter").tags_combined.values
    for t, c in zip(tag, comb):
        print(t / c)
    sleep(400)
    pprint(return_tags())
    dfs = split_hourly(read_hdf5(files["news_store"], "twitter_conversations"))
    tags_dicts = []
    for df in dfs:
        d = tags_from_df(df, title=False)
        tags_dicts.append(d)
    df_interval = dicts_to_df(tags_dicts)
    update_hdf5(files["tags_df"], "twitter",
                dataframe=df_interval)
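# Hedged illustration of the append/fillna pattern dicts_to_df relies on: rows
# with different keys line up on a shared set of columns and missing tag counts
# become 0. The toy tag names and counts here are made up.
def dicts_to_df_example():
    toy = [
        {"tags_combined": 3, "AMD": 2, "TSLA": 1},
        {"tags_combined": 1, "AAPL": 1},
    ]
    df = pd.DataFrame(columns=["tags_combined", "AMD", "TSLA", "AAPL"])
    for row in toy:
        df = df.append(row, ignore_index=True).fillna(0)
    print(df)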
def get_conversations(self, since_id=0, tags=(), items_batch=25):
    if not tags:
        tags = [t for t in twitter_tags() if len(t) > 1]
    log(f"Getting raw data for {tags}, since_id={since_id}")
    search_str = ""
    try:
        data = read_hdf5(files["news_store"], key="twitter_conversations")
        ids = [int(i) for i in data.id.values]
        self.ids.extend(ids)
        since_id = max(ids)
    except Exception as e:
        since_id = 1252311068950638593
        error(e)
    if not since_id:
        try:
            ids = read_hdf5(self.store_dir, "twitter_conversations")["id"].values
            ids_ = True
        except (AttributeError, KeyError, TypeError) as e:
            error(str(e))
            ids_ = False
        if not ids_:
            since_id = 0
        else:
            since_id = max(ids)
    df = pd.DataFrame(columns=[
        "created", "id", "retweets", "text", "user_id", "favorits",
        "user_followers"
    ])
    # Build an OR-joined search query from the first 40 tags, excluding retweets.
    tags = [tag for tag in tags if isinstance(tag, str)]
    for t in tags[:40]:
        search_str += f"{t} OR "
    if search_str.endswith("OR "):
        search_str = search_str[:-4]
    search_str += " -filter:retweets"
    print(search_str)
    for i in range(5):
        try:
            since_id = max(self.ids)
            if since_id:
                cursor = tweepy.Cursor(self.api.search,
                                       q=search_str,
                                       lang="en",
                                       full_text=True,
                                       since_id=since_id,
                                       wait_on_rate_limit=True,
                                       tweet_mode="extended",
                                       wait_on_rate_limit_notify=True,
                                       retry_delay=5).items(items_batch)
            else:
                cursor = tweepy.Cursor(self.api.search,
                                       q=search_str,
                                       lang="en",
                                       full_text=True,
                                       wait_on_rate_limit=True,
                                       tweet_mode="extended",
                                       wait_on_rate_limit_notify=True,
                                       retry_delay=5).items(items_batch)
            for tweet in cursor:
                j = tweet._json
                if j["id"] in self.ids:
                    print(str(j["id"]) + " already in wtf")
                else:
                    created = date_to_posix(str(j["created_at"]), list=False)
                    if created is not None:
                        data = {
                            "created": float(created),
                            "id": str(j["id"]),
                            "retweets": str(j["retweet_count"]),
                            "text": str(j["full_text"]),
                            "user_id": str(j["user"]["id"]),
                            "favorits": str(j["favorite_count"]),
                            "user_followers": str(j["user"]["followers_count"])
                        }
                        #tag_str(j["full_text"], as_str=True)
                        self.ids.append(int(j["id"]))
                        if len(data["text"]) >= 333:
                            print(data["text"], "left out")
                        elif len(data["text"]) > 25:
                            df = df.append(data, ignore_index=True)
                        else:
                            print(j)
            df.set_index("created", inplace=True)
            print(df)
            self.ids.extend([int(v) for v in df.id.values])
            update_hdf5(files["news_store"],
                        key="twitter_conversations",
                        dataframe=df,
                        append=True)
            df = pd.DataFrame(columns=[
                "created", "id", "retweets", "text", "user_id", "favorits",
                "user_followers"
            ])
        except FileNotFoundError as e:
            error(str(e))
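# Hedged, standalone illustration of the search-string construction used in
# get_conversations(): join up to 40 tags with " OR " and exclude retweets.
# The tag list and helper name are made up for the example.
def build_search_query_example(tags=("TSLA", "AMD", "AAPL"), max_tags=40):
    query = " OR ".join(t for t in tags[:max_tags] if isinstance(t, str))
    return query + " -filter:retweets"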
#print(d)
#update_hdf5("news_data_new.h5", "news_articles", dataframe=d, append=False)
#print(read_hdf5(files["news_store"], "twitter_conversations"))

if __name__ == "__main__":
    loop = True
    runs = 0
    while loop:
        #print(read_hdf5(files["news_store"], "twitter_conversations").columns)
        #update_hdf5(files["news_store"], "twitter_conversations", read_hdf5(files["news_store"], "twitter_conversations").drop("tag", inplace=False, axis=1), append=False)
        #update_hdf5(files["news_store"], "twitter_conversations", read_hdf5(files["news_store"], "twitter_conversations").set_index("created", inplace=False), append=False)
        print(read_hdf5(files["news_store"], "twitter_conversations"))
        runs += 1
        TwitterNews()
        #print(read_hdf5(files["news_store"], "twitter_conversations").tag)
        try:
            Reddit().scrape_subreddit()
        except Exception as e:
            error(e)
        #del_hdf5_key(files["news_store"], "news_articles")
        NewsArticles().gather_different()
        print(read_hdf5(files["news_store"], "twitter_conversations"))
        print(read_hdf5(files["news_store"], "news_articles"))
        print(read_hdf5(files["news_store"], "reddit"))
from __init__ import read_hdf5, files, find_tags_from_str
from pprint import pprint
import pandas as pd
from time import sleep

reddit_df = read_hdf5(files["news_store"], "reddit")
print(reddit_df)
print(reddit_df.columns)
print(reddit_df.selftext.values)
for t in reddit_df.title.values:
    if len(t) > 15:
        print(t)
        print("tags; ", find_tags_from_str(t, as_str=True))