def getfeaturesuser():
    """Build a shuffled author-by-subreddit table of subscriber-scaled scores.

    Rows from ``dbconnect.getdeminfo()`` are labelled with leaning 1 and rows
    from ``dbconnect.getrepinfo()`` with leaning 0. For every distinct
    (author, subreddit, score, leaning) row, ``score / subscribers * 1000`` is
    added to that author's cell for the subreddit, where ``subscribers`` is
    looked up in ``dbconnect.getsubscribercount()``. Subreddits whose
    subscriber count equals 0 contribute nothing.

    Returns:
        A ``pandas.DataFrame`` with one row per author (row order shuffled,
        author names discarded), a ``leaning`` column holding the label as a
        string (``'1'`` or ``'0'``), and one column per remaining subreddit.
        The ``democrats`` and ``Republican`` columns, and every column named
        by ``dbconnect.getdeletedsubreddits()``, are removed when present.
    """
    dbconnect.connect()
    try:
        demdf = pd.DataFrame(dbconnect.getdeminfo(),
                             columns=['author', 'subreddit', 'score'])
        demdf["leaning"] = 1

        repdf = pd.DataFrame(dbconnect.getrepinfo(),
                             columns=['author', 'subreddit', 'score'])
        repdf["leaning"] = 0

        df = pd.concat([demdf, repdf]).drop_duplicates()

        authorlist = df.author.unique()
        subredditlist = ['leaning'] + list(df.subreddit.unique())

        # One row per author, one column per subreddit, everything starts at 0.
        finallist = pd.DataFrame(index=list(authorlist), columns=subredditlist)
        finallist = finallist.fillna(0)

        subscriberlist = dbconnect.getsubscribercount()
        # total= lets tqdm show real progress; a bare generator has no length.
        for entry in tqdm(df.itertuples(index=False), total=len(df)):
            currentauthor = str(entry.author)
            currentsubreddit = str(entry.subreddit)
            currentsubscriber = subscriberlist[currentsubreddit]
            if currentsubscriber != 0:
                try:
                    finallist.loc[currentauthor, currentsubreddit] += (
                        float(entry.score) / currentsubscriber) * 1000
                except (TypeError, ValueError, KeyError):
                    # Best effort: report the offending subreddit and move on
                    # (e.g. a non-numeric score or subscriber count).
                    print(currentsubreddit, currentsubscriber)
            # The label is stored as a string, e.g. '1' / '0'.
            finallist.loc[currentauthor, 'leaning'] = str(entry.leaning)

        # Shuffle the rows and discard the author index.
        finallist = finallist.sample(frac=1).reset_index(drop=True)

        # These columns may be absent for some datasets; do not fail on that.
        finallist = finallist.drop(columns=['democrats', 'Republican'],
                                   errors='ignore')
        for delsub in dbconnect.getdeletedsubreddits():
            try:
                finallist = finallist.drop(columns=delsub)
            except KeyError:
                # Column not present in this table; nothing to remove.
                continue

        return finallist
    finally:
        # Always release the connection, even when building the table fails.
        dbconnect.disconnect()
예제 #2
0
def getfinallist(output_path="../activitydata2020.pkl"):
    """Build the max-normalised author activity table and pickle it.

    Rows from ``dbconnect.getdeminfo()`` are labelled ``"dem"`` and rows from
    ``dbconnect.getrepinfo()`` are labelled ``"rep"``. Scores are summed per
    (author, subreddit), each subreddit column is divided by its own maximum
    (columns whose maximum is 0 are left as they are) and rounded to three
    decimals. Rows are then shuffled, the author index is discarded, and the
    ``democrats`` / ``Republican`` columns are removed when present. The
    resulting frame is printed and written with ``DataFrame.to_pickle``.

    Args:
        output_path: Destination of the pickle file. Defaults to the path the
            function has always written to.

    Returns:
        None.
    """
    dbconnect.connect()
    try:
        demdf = pd.DataFrame(dbconnect.getdeminfo(),
                             columns=['author', 'subreddit', 'score'])
        demdf["leaning"] = "dem"

        repdf = pd.DataFrame(dbconnect.getrepinfo(),
                             columns=['author', 'subreddit', 'score'])
        repdf["leaning"] = "rep"

        df = pd.concat([demdf, repdf]).drop_duplicates()

        authorlist = df.author.unique()
        subredditlist = ['leaning'] + list(df.subreddit.unique())

        # One row per author, one column per subreddit, everything starts at 0.
        finallist = pd.DataFrame(index=list(authorlist), columns=subredditlist)
        finallist = finallist.fillna(0)

        # total= lets tqdm show real progress; a bare generator has no length.
        for entry in tqdm(df.itertuples(index=False), total=len(df)):
            currentauthor = str(entry.author)
            currentsubreddit = str(entry.subreddit)
            finallist.loc[currentauthor, currentsubreddit] += float(entry.score)
            finallist.loc[currentauthor, 'leaning'] = str(entry.leaning)

        # Scale every subreddit column by its own maximum.
        for column in finallist:
            if column == 'leaning':
                continue
            column_max = finallist[column].max()
            if column_max == 0:
                # Avoid dividing by zero; the column stays untouched.
                continue
            as_float = finallist[column].astype('float')
            finallist[column] = as_float.divide(other=float(column_max)).round(3)

        # Shuffle the rows and discard the author index.
        finallist = finallist.sample(frac=1).reset_index(drop=True)

        # These columns may be absent for some datasets; do not fail on that.
        finallist = finallist.drop(columns=['democrats', 'Republican'],
                                   errors='ignore')
        print(finallist)
        finallist.to_pickle(output_path)
    finally:
        # Always release the connection, even when building the table fails.
        dbconnect.disconnect()
def getfeaturesmax():
    """Build a class-balanced, max-normalised author-by-subreddit table.

    Rows from ``dbconnect.getdeminfo()`` are labelled 0 and rows from
    ``dbconnect.getrepinfo()`` are labelled 1. Scores are summed per
    (author, subreddit). The table is then truncated so both labels have the
    same number of authors (the first ``k`` of each, where ``k`` is the size
    of the smaller group), every subreddit column is divided by its own
    maximum (columns whose maximum is 0 are left as they are) and rounded to
    three decimals.

    NOTE(review): the label encoding here (dem=0, rep=1) is the opposite of
    ``getfeaturesuser`` (dem=1, rep=0) -- confirm this is intended.

    Returns:
        A ``pandas.DataFrame`` with shuffled rows, the author index discarded,
        a ``leaning`` column holding the label as a string (``'0'`` or
        ``'1'``), and one column per subreddit. The ``democrats`` and
        ``Republican`` columns are removed when present.
    """
    dbconnect.connect()
    try:
        demdf = pd.DataFrame(dbconnect.getdeminfo(),
                             columns=['author', 'subreddit', 'score'])
        demdf["leaning"] = 0

        repdf = pd.DataFrame(dbconnect.getrepinfo(),
                             columns=['author', 'subreddit', 'score'])
        repdf["leaning"] = 1

        df = pd.concat([demdf, repdf]).drop_duplicates()

        authorlist = df.author.unique()
        subredditlist = ['leaning'] + list(df.subreddit.unique())

        # One row per author, one column per subreddit, everything starts at 0.
        finallist = pd.DataFrame(index=list(authorlist), columns=subredditlist)
        finallist = finallist.fillna(0)

        # total= lets tqdm show real progress; a bare generator has no length.
        for entry in tqdm(df.itertuples(index=False), total=len(df)):
            currentauthor = str(entry.author)
            currentsubreddit = str(entry.subreddit)
            finallist.loc[currentauthor, currentsubreddit] += float(entry.score)
            # The label is stored as a string, hence the '0' / '1' filters below.
            finallist.loc[currentauthor, 'leaning'] = str(entry.leaning)

        # Balance the classes: keep the same number of authors for each label.
        demlist = finallist[finallist.leaning == '0']
        replist = finallist[finallist.leaning == '1']
        balanced_size = min(len(replist.index), len(demlist.index))
        finallist = pd.concat([
            replist.head(balanced_size),
            demlist.head(balanced_size),
        ])

        # Scale every subreddit column by its own maximum.
        for column in finallist:
            if column == 'leaning':
                continue
            column_max = finallist[column].max()
            if column_max == 0:
                # Avoid dividing by zero; the column stays untouched.
                continue
            as_float = finallist[column].astype('float')
            finallist[column] = as_float.divide(other=float(column_max)).round(3)

        # Shuffle the rows and discard the author index.
        finallist = finallist.sample(frac=1).reset_index(drop=True)

        # These columns may be absent for some datasets; do not fail on that.
        finallist = finallist.drop(columns=['democrats', 'Republican'],
                                   errors='ignore')
        return finallist
    finally:
        # Always release the connection, even when building the table fails.
        dbconnect.disconnect()
예제 #4
0
def main():
    """Print the 100 most frequent nouns/adjectives in the rows of getdeminfo.

    Each row returned by ``dbconnect.getdeminfo()`` is joined into one string,
    stripped of newlines, the punctuation ``, . ! ?``, the literal marker
    ``[removed]``, square brackets and double quotes, then split on
    whitespace and tagged with ``nltk.pos_tag``. Tokens whose tag is in the
    noun/adjective set below are lower-cased and counted; the 100 most common
    are printed as a ``{word: count}`` dict, most frequent first.

    Returns:
        None.
    """
    # Local import keeps this block self-contained.
    from collections import Counter

    # Tags to keep: nouns and adjectives.
    keep_tags = {'NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'JJR', 'JJS'}
    # Single characters removed before / after the "[removed]" marker. The
    # marker must be stripped while its brackets are still in the text.
    strip_punctuation = str.maketrans('', '', '\n,.!?')
    strip_brackets_quotes = str.maketrans('', '', '[]"')

    dbconnect.connect()
    try:
        counts = Counter()
        for row in dbconnect.getdeminfo():
            text = "".join(row).translate(strip_punctuation)
            text = text.replace("[removed]", '')
            text = text.translate(strip_brackets_quotes)
            tagged = nltk.pos_tag(text.split())
            counts.update(token.lower()
                          for token, pos in tagged if pos in keep_tags)
    finally:
        # Always release the connection, as the sibling functions do.
        dbconnect.disconnect()

    # most_common keeps first-seen order among equal counts, like a stable
    # descending sort over the insertion-ordered counts.
    dictdata = dict(counts.most_common(100))  # top100
    print(dictdata)