def getfeaturesuser():
    """Build a shuffled author-by-subreddit table weighted by subscriber count.

    Rows of (author, subreddit, score) are loaded through ``dbconnect``:
    rows from ``getdeminfo()`` are labelled with leaning 1 and rows from
    ``getrepinfo()`` with leaning 0.  For every row whose subreddit has a
    non-zero subscriber count, ``score / subscriber_count * 1000`` is added
    to that author's cell for the subreddit.

    Returns:
        pandas.DataFrame: one row per author in random order with a fresh
        0..n-1 index.  The ``leaning`` column holds the label as a string
        (``'1'`` or ``'0'``); every other column is a subreddit.  The
        ``democrats`` and ``Republican`` columns and every column named by
        ``dbconnect.getdeletedsubreddits()`` are removed.

    Raises:
        KeyError: if a subreddit is missing from the object returned by
            ``dbconnect.getsubscribercount()``.

    NOTE(review): the label encoding here (dem=1, rep=0) is the opposite of
    the one used by ``getfeaturesmax`` (dem=0, rep=1) -- confirm which one
    downstream consumers expect.
    """
    post_columns = ['author', 'subreddit', 'score']

    dbconnect.connect()
    try:
        dem_posts = pd.DataFrame(dbconnect.getdeminfo(), columns=post_columns)
        dem_posts["leaning"] = 1
        rep_posts = pd.DataFrame(dbconnect.getrepinfo(), columns=post_columns)
        rep_posts["leaning"] = 0
        posts = pd.concat([dem_posts, rep_posts]).drop_duplicates()

        # One row per author, one column per subreddit (plus the label),
        # every cell starting at 0.
        feature_columns = ['leaning'] + list(posts.subreddit.unique())
        features = pd.DataFrame(
            index=list(posts.author.unique()), columns=feature_columns
        ).fillna(0)

        subscriber_counts = dbconnect.getsubscribercount()

        # itertuples() keeps each column's own dtype, whereas iterrows()
        # coerces every row into a single-dtype Series.
        for post in tqdm(posts.itertuples(index=False)):
            author = str(post.author)
            subreddit = str(post.subreddit)
            subscribers = subscriber_counts[subreddit]

            if subscribers != 0:
                try:
                    features.loc[author, subreddit] += (
                        float(post.score) / subscribers
                    ) * 1000
                except (TypeError, ValueError, KeyError):
                    # Best effort: report rows that cannot be scored (for
                    # example a missing subscriber count) and carry on.
                    print(subreddit, subscribers)

            # Rep rows follow dem rows in `posts`, so for an author present
            # in both groups the label of the last processed row wins.
            features.loc[author, 'leaning'] = str(post.leaning)

        # Shuffle the authors and discard the author names held in the index.
        features = features.sample(frac=1).reset_index(drop=True)

        # Presumably removed because these two subreddits define the labels
        # (TODO confirm); tolerate their absence instead of crashing.
        features = features.drop(
            columns=['democrats', 'Republican'], errors='ignore'
        )

        # Drop only the deleted subreddits that are actually present.
        deleted_present = [
            name for name in dbconnect.getdeletedsubreddits()
            if name in features.columns
        ]
        return features.drop(columns=deleted_present)
    finally:
        # Release the connection even when loading or processing fails.
        dbconnect.disconnect()
def getfinallist():
    """Build, print and pickle the max-normalised author activity table.

    Rows of (author, subreddit, score) are loaded through ``dbconnect``:
    rows from ``getdeminfo()`` are labelled ``"dem"`` and rows from
    ``getrepinfo()`` are labelled ``"rep"``.  Scores are summed per
    author/subreddit pair, then every subreddit column whose maximum is not
    0 is divided by that maximum and rounded to 3 decimals.

    The shuffled table (fresh 0..n-1 index, ``democrats`` and ``Republican``
    columns removed) is printed and written to ``../activitydata2020.pkl``.
    Nothing is returned.
    """
    post_columns = ['author', 'subreddit', 'score']

    dbconnect.connect()
    try:
        dem_posts = pd.DataFrame(dbconnect.getdeminfo(), columns=post_columns)
        dem_posts["leaning"] = "dem"
        rep_posts = pd.DataFrame(dbconnect.getrepinfo(), columns=post_columns)
        rep_posts["leaning"] = "rep"
        posts = pd.concat([dem_posts, rep_posts]).drop_duplicates()

        # One row per author, one column per subreddit (plus the label),
        # every cell starting at 0.
        activity_columns = ['leaning'] + list(posts.subreddit.unique())
        activity = pd.DataFrame(
            index=list(posts.author.unique()), columns=activity_columns
        ).fillna(0)

        # itertuples() keeps each column's own dtype, whereas iterrows()
        # coerces every row into a single-dtype Series.
        for post in tqdm(posts.itertuples(index=False)):
            author = str(post.author)
            activity.loc[author, str(post.subreddit)] += float(post.score)
            # Rep rows follow dem rows in `posts`, so for an author present
            # in both groups the label of the last processed row wins.
            activity.loc[author, 'leaning'] = str(post.leaning)

        # Scale each subreddit column by its own maximum.
        for column in activity:
            if column == 'leaning':
                continue
            column_max = activity[column].max()
            if column_max == 0:
                # Avoid dividing by zero; the column is left untouched.
                continue
            as_float = activity[column].astype('float')
            activity[column] = as_float.divide(other=float(column_max)).round(3)

        # Shuffle the authors and discard the author names held in the index.
        activity = activity.sample(frac=1).reset_index(drop=True)

        # Presumably removed because these two subreddits define the labels
        # (TODO confirm); tolerate their absence instead of crashing.
        activity = activity.drop(
            columns=['democrats', 'Republican'], errors='ignore'
        )

        print(activity)
        activity.to_pickle("../activitydata2020.pkl")
    finally:
        # Release the connection even when loading or processing fails.
        dbconnect.disconnect()
def getfeaturesmax():
    """Build a class-balanced, max-normalised author-by-subreddit table.

    Rows of (author, subreddit, score) are loaded through ``dbconnect``:
    rows from ``getdeminfo()`` are labelled with leaning 0 and rows from
    ``getrepinfo()`` with leaning 1.  Scores are summed per author/subreddit
    pair.  The table is then truncated so both labels have the same number
    of authors (the first ``n`` of each, where ``n`` is the smaller group
    size), and every subreddit column whose maximum is not 0 is divided by
    that maximum and rounded to 3 decimals.

    Returns:
        pandas.DataFrame: one row per retained author in random order with a
        fresh 0..n-1 index.  The ``leaning`` column holds the label as a
        string (``'0'`` or ``'1'``); every other column is a subreddit.  The
        ``democrats`` and ``Republican`` columns are removed.

    NOTE(review): the label encoding here (dem=0, rep=1) is the opposite of
    the one used by ``getfeaturesuser`` (dem=1, rep=0) -- confirm which one
    downstream consumers expect.
    """
    post_columns = ['author', 'subreddit', 'score']

    dbconnect.connect()
    try:
        dem_posts = pd.DataFrame(dbconnect.getdeminfo(), columns=post_columns)
        dem_posts["leaning"] = 0
        rep_posts = pd.DataFrame(dbconnect.getrepinfo(), columns=post_columns)
        rep_posts["leaning"] = 1
        posts = pd.concat([dem_posts, rep_posts]).drop_duplicates()

        # One row per author, one column per subreddit (plus the label),
        # every cell starting at 0.
        feature_columns = ['leaning'] + list(posts.subreddit.unique())
        features = pd.DataFrame(
            index=list(posts.author.unique()), columns=feature_columns
        ).fillna(0)

        # itertuples() keeps each column's own dtype, whereas iterrows()
        # coerces every row into a single-dtype Series.
        for post in tqdm(posts.itertuples(index=False)):
            author = str(post.author)
            features.loc[author, str(post.subreddit)] += float(post.score)
            # Rep rows follow dem rows in `posts`, so for an author present
            # in both groups the label of the last processed row wins.
            features.loc[author, 'leaning'] = str(post.leaning)

        # Balance the classes.  Labels were stored as strings above, hence
        # the comparison against '0' / '1'.
        dem_rows = features[features.leaning == '0']
        rep_rows = features[features.leaning == '1']
        per_class = min(len(rep_rows.index), len(dem_rows.index))
        features = pd.concat(
            [rep_rows.head(per_class), dem_rows.head(per_class)]
        )

        # Scale each subreddit column by its own maximum.
        for column in features:
            if column == 'leaning':
                continue
            column_max = features[column].max()
            if column_max == 0:
                # Avoid dividing by zero; the column is left untouched.
                continue
            as_float = features[column].astype('float')
            features[column] = as_float.divide(other=float(column_max)).round(3)

        # Shuffle the authors and discard the author names held in the index.
        features = features.sample(frac=1).reset_index(drop=True)

        # Presumably removed because these two subreddits define the labels
        # (TODO confirm); tolerate their absence instead of crashing.
        return features.drop(
            columns=['democrats', 'Republican'], errors='ignore'
        )
    finally:
        # Release the connection even when loading or processing fails.
        dbconnect.disconnect()
def main():
    """Print the 100 most frequent nouns and adjectives from ``getdeminfo()``.

    Each row returned by ``dbconnect.getdeminfo()`` is concatenated into one
    string (assumes every row is a sequence of strings -- TODO confirm),
    stripped of punctuation and ``[removed]`` markers, split on whitespace
    and part-of-speech tagged with ``nltk.pos_tag``.  Tokens tagged as nouns
    or adjectives are lower-cased and counted, and the 100 most common are
    printed as a ``{word: count}`` dict, most frequent first.
    """
    # Imported locally so this function does not depend on the file's
    # top-level import block.
    from collections import Counter

    # Penn Treebank tags to keep: nouns and adjectives.
    kept_tags = {'NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'JJR', 'JJS'}

    # The original removal order is preserved: newline and , . ! ? go first,
    # then the literal "[removed]" marker, then any remaining brackets and
    # double quotes.
    # NOTE(review): newlines are deleted rather than replaced by a space, so
    # words on either side of a line break are fused -- confirm intent.
    drop_punctuation = str.maketrans('', '', '\n,.!?')
    drop_brackets_quotes = str.maketrans('', '', '[]"')

    word_counts = Counter()

    dbconnect.connect()
    try:
        for fields in dbconnect.getdeminfo():
            text = "".join(fields)
            text = text.translate(drop_punctuation)
            text = text.replace("[removed]", '')
            text = text.translate(drop_brackets_quotes)

            tagged = nltk.pos_tag(text.split())
            word_counts.update(
                token.lower() for token, tag in tagged if tag in kept_tags
            )
    finally:
        # Every other function in this file releases the connection; do the
        # same here, including on failure.
        dbconnect.disconnect()

    # most_common() orders by count descending and keeps first-seen order
    # for ties, matching a stable reverse sort on the counts.
    print(dict(word_counts.most_common(100)))