예제 #1
0
def run_colfil(inputcID,nodesf,subids):
    inputID=inputcID
    engine = create_engine('mysql+pymysql://X:X@localhost/X?unix_socket=X')
    con = mdb.connect(user= '******', passwd='X', db='X',unix_socket='X',charset='utf8')
    with con:
        cur = con.cursor()
        cur.execute("SELECT distinct cID FROM ChannelInfo")# limit 1000")
        response = cur.fetchall()
        cur.close()
    existIDs = set(np.array(response).flatten())
    print "Database contains info about",len(existIDs),"YT channels"
    #determine for which channels in the target community we are stil missing data
    pullcids=list(set(nodesf)-set(existIDs))
    print "Have to pull Channel info for",len(pullcids),"channels that are listed in the target community"
    counti=0
    for cID in pullcids:
        try:
            counti=counti+1
            df_r=yt.channelinfo(cID)
            #print 'Info pulled for channel ', df_r.cTitle.values,  ': pulled'
            df_r.to_sql('ChannelInfo', engine, if_exists='append', index=False)
        except HttpError:
            print "Can not pull channel",cID,"info, going to create a NotChannel record"
            df_r = pd.DataFrame({ 'cID' : pd.Series(cID),
                                 'DateAdded' : pd.Series('None'),
                                 'cTitle' : pd.Series('NotChannel'),
                                 'cDescr' : pd.Series('None'),
                                 'UploadsID' : pd.Series('None'),
                                 'SubCount': 0,
                                 'ViewCount': 0,
                                 'picUrl' : pd.Series('None'),
                                })
            df_r.to_sql('ChannelInfo', engine, if_exists='append', index=False)
            pass
    #print "pulled channel info for new matches, connect to database and load all channel info"
    con = mdb.connect(user= '******', passwd='X', db='X',unix_socket='X',charset='utf8')
    sqlstr='SELECT cID,cTitle,VideoCount,SubCount,ViewCount,cDescr,UploadsID,picUrl FROM ChannelInfo WHERE cID IN (%s)' 
    in_p = ', '.join(itertools.repeat('%s', len(nodesf)))
    sqlstr = sqlstr % in_p
    with con:
        cur = con.cursor()
        cur.execute(sqlstr, nodesf)
        response = cur.fetchall()
        cur.close()
    print "pulled info about", np.array(response).shape, " channels in target community"
    dfi = pd.DataFrame(columns=('cID','cTitle','VideoCount','SubCount','ViewCount','cDescr','UploadsID','picUrl'))
    counti=0
    for channel in np.array(response):
        if channel[0] in nodesf:
            #print "iter ",counti,"add channel info to dataframe"
            dfi.loc[counti] = channel
            counti=counti+1
    print "added channel info to a dataframe for",counti,"channels"
    dfu=dfi.drop_duplicates()
    print "dropped duplicates"
    if inputcID in list(dfu.cID.values):
        print "inputID survived droping duplicates"
    keywords='beauty|haul|fashion|makeup|shopping|skin|hair|nail|polish|clothes|style|shopaholic'
    dff1=dfu[dfu['cDescr'].str.contains(keywords, case=False)]
    if inputcID in list(dff1.cID.values):
        print "inputID survived keywords filtering"
    #pickle.dump((dff1), open('./static/data/'+inputID+'/'+inputID+'_debug_1.p', 'wb')) #debug
    dff1[['SubCount', 'VideoCount','ViewCount']]=dff1[['SubCount', 'VideoCount','ViewCount']].astype(float)
    dff2=dff1[dff1['VideoCount']>10]
    if inputcID in list(dff2.cID.values):
        print "inputID survived video count filtering"
    dff=dff2[dff2['SubCount']>2000]
    if inputcID in list(dff.cID.values):
        print "inputID survived sub count filtering"
    print "Filters reduced # of matches from", len(dfu.index),"to",len(dff.index)
    plt.figure()
    dff['SubCount'].hist(bins=200, color='mediumpurple')
    plt.xlim([0,5*10**5])
    plt.xlabel('Number of subscribers per channel',fontsize=22)
    plt.ylabel('Frequency',fontsize=22)
    plt.title('Average number of subscribers %dK (std = %dK)' % (dff['SubCount'].mean()/1000, dff['SubCount'].mean()/1000),fontsize=30) 
    plt.savefig('./static/img/'+inputID+'/'+inputID+'_commf_subcount_hist_3.png', dpi=400, format='png')
    #print "create figure of ave num of vids"
    plt.figure()
    dff['VideoCount'].hist(bins=50,color='coral')
    plt.xlim([0,1500])
    plt.xlabel('Number of videos per channel',fontsize=22)
    plt.ylabel('Frequency',fontsize=22)
    plt.title('Average number of videos %d (std = %d)' % (dff['VideoCount'].mean(), dff['VideoCount'].mean()),fontsize=35) 
    plt.savefig('./static/img/'+inputID+'/'+inputID+'_commf_videocount_hist_1.png', dpi=400, format='png')
    viewpv=np.array(dff['ViewCount'].values/dff['VideoCount'].values)

    con = mdb.connect(user= '******', passwd='X', db='X',unix_socket='X',charset='utf8')
    sublists=[]
    sublist_i=[]
    normsub=1
    print "pull subslists_ij"
    for i,cIDi in enumerate(dff['cID']):
        with con:
            cur = con.cursor()
            cur.execute("SELECT distinct cID FROM CommenterSubs WHERE tocID= %s", (cIDi));
            response = cur.fetchall()
            sublist_ii = [node[0] for node in np.array(response)]
            cur.close()
        sublist_i=list(set(sublist_ii) & set(subids))
        sublists.append(np.array(sublist_i))
        if cIDi==inputcID:
            normsub=len(sublist_i)
            indinputID=i
            print "! indinputID is",indinputID
    #print "create zero-ed similarity matrix"
    Sij=np.zeros((len(dff.index),len(dff.index)))
    Sijn=np.zeros((len(dff.index),len(dff.index)))
    for i,cIDi in enumerate(dff['cID']):
        sublist_i = set(sublists[i])
        for j,cIDj in enumerate(dff['cID']):
            if (j>i):
                sublist_j = set(sublists[j])
                counti=len(sublist_i & sublist_j)
                Sij[i][j]= float(counti)
                Sijn[i][j]= float(counti)/normsub
            elif i==j:
                Sij[i][j]=0
                Sijn[i][j]=0
            else:
                Sij[i][j]=Sij[j][i]
                Sijn[i][j]=Sijn[j][i]
    #print "create figure of the similarity matrix"
    fig = plt.figure()
    ax = fig.add_subplot(1,1,1)
    ax.set_aspect('equal')
    plt.imshow(Sijn, interpolation='nearest', cmap=plt.cm.rainbow)
    plt.colorbar()
    plt.savefig('./static/img/'+inputID+'/'+'case1_v2_Sijn_plot_2.png', dpi=300, format='png')#,transparent=True)
    corrmat=np.zeros((len(dff.index),len(dff.index)))
    corrmatw=np.zeros((len(dff.index),len(dff.index)))
    cc2=[]
    #print "about to make corr matrix"
    for i,cIDi in enumerate(dff['cID']):
        cc2.append(np.corrcoef(Sijn[:][indinputID],Sijn[:][i])[0,1])
        for j,cIDj in enumerate(dff['cID']):
            corrmat[i][j]=np.corrcoef(Sijn[:][i],Sijn[:][j])[0,1]
    totsimv=np.zeros(len(dff.index))
    totsim=[]
    for i,cIDi in enumerate(dff['cID']): #compute correlation coef for i channel with the input channel (weight for col. fil.)
            totsimv[i]=sum([corrmat[i][j]*cc2[i] for j,cIDj in enumerate(dff['cID'])])/sum(cc2)
            totsim.append([cIDi, dff['cTitle'].values[i], totsimv[i]])
            corrmatw[:][i]=corrmat[:][i]*cc2
    totsimvn=sorted(totsimv,reverse=True)
    totsimn=sorted(totsim, key=lambda totsim: totsim[2],reverse=True) 
    sort_index = np.argsort(totsimv)
    print "scored everything"
    #collab filter
    dff['score'] = pd.Series(totsimv, index=dff.index)
    dffin=dff.sort(columns='score', axis=0, ascending=False, inplace=False, kind='quicksort', na_position='last')
    IDpic=dffin[dffin['cID']==inputID]
    dffin = dffin[dffin.cID != inputID]
    final_results=dffin.values.tolist()
    input_info=IDpic.values.tolist()[0]
    print "about to dump results to pickle"
    pickle.dump((final_results,input_info), open('./static/data/'+inputID+'/'+inputID+'_collfilter_list_freqtauto2_comm.p', 'wb'))
예제 #2
0
def collect_data(inputcID):
    engine = create_engine('mysql+pymysql://X:X@localhost/X?unix_socket=X')
    #print "created engine"
    con = mdb.connect(user= '******', passwd='X', db='X',unix_socket='X',charset='utf8')
    #print "mdb connected"
    if not os.path.exists('./static/data/'+inputcID):
        os.makedirs('./static/data/'+inputcID)
    if not os.path.exists('./static/img/'+inputcID):
        os.makedirs('./static/img/'+inputcID)
    with con:
        cur = con.cursor()
        cur.execute("SELECT count(distinct cID) FROM ChannelInfo WHERE cID = %s", (inputcID))
        infofound = int(np.array(cur.fetchall()).flatten())
        cur.close()
    if infofound==0: #retrive channel stats and uploads playlist
        df_r=yt.channelinfo(inputcID)
        df_r.to_sql('ChannelInfo', engine, if_exists='append', index=False)
        uploadsid = str(df_r.UploadsID.values).strip('u[u\'\'])\',')
        print "Pulled Uploads playlist ID:",uploadsid
    else:
        print "Congrats! Input channel info is already in the database"
        with con: 
            cur = con.cursor()
            cur.execute("SELECT distinct UploadsID FROM ChannelInfo WHERE cID = %s", (inputcID))
            uploadsid = str(np.array(cur.fetchall()).flatten()).strip('u[u\'\'])\',')
            cur.close()
    vids=[]
    with con:
        cur = con.cursor()
        cur.execute("SELECT distinct vID FROM VideoInfo WHERE cID = %s", (inputcID))
        vids = np.array(cur.fetchall()).flatten()
        cur.close()
    if len(vids)<30: 
        print"retrieve  uploads info for playlist",uploadsid
        df_v=yt.pull_uploads(uploadsid)
        print "pulled uploads info"
        df_v.to_sql('VideoInfo', engine, if_exists='append', index=False)
        con = mdb.connect(user= '******', passwd='X', db='X',unix_socket='X',charset='utf8')
        with con:
            cur = con.cursor()
            cur.execute("SELECT distinct vID FROM VideoInfo WHERE cID = %s", (inputcID))
            vids = np.array(cur.fetchall()).flatten()
            cur.close()
    else:
        print "Congrats! Video info for the input channel is already in the database"
    cpervid=[]
    with con:
        cur = con.cursor()
        cur.execute("SELECT distinct vID FROM CommentLinks WHERE cID = %s", (inputcID))
        vidsfound = np.array(cur.fetchall()).flatten()
        cur.close()
    if len(vidsfound)<30: #retrive comments for a given video
        vidstopull = set(vids) - set(vidsfound)
        for vID in list(vidstopull):
            print "Have to pull comments for vID",vID
            try:
                [nextp,rperpage,totr,ccount,df_c]=yt.pull_comments(vID,-1)
                df_c.to_sql('CommentLinks', engine, if_exists='append', index=False)
                reqtot = 1
                ccount_i=ccount
                while nextp!='None' or reqtot<4: #load no more than 3 pages to most recent comments
                    [nextp,rperpage,totr,ccount,df_c]=yt.pull_comments(vID,nextp)
                    ccount_i=ccount_i+ccount
                    reqtot = reqtot + 1     
                    df_c.to_sql('CommentLinks', engine, if_exists='append', index=False)
                cpervid.append(ccount_i)
            except:
                df_c = pd.DataFrame({ 'sID' : pd.Series('NotShared'),
                                     'cID' : pd.Series(inputcID),
                                     'vID' : pd.Series(vID)
                                    })
                df_c.to_sql('CommentLinks', engine, if_exists='append', index=False)
                pass
    else:
        print "Congrats! Comment info for recent 30 vids for the channel cID", inputcID, "is already in the database"
    counti=0
    df10 = pd.DataFrame(columns=('sID','vID'))
    subids=[]
    con = mdb.connect(user= '******', passwd='X', db='X',unix_socket='X',charset='utf8')
    with con:
        cur = con.cursor()
        cur.execute("SELECT sID,vID FROM CommentLinks WHERE cID= %s AND sID<>'NotShared'",inputcID)
        subids = np.array(cur.fetchall())
        cur.close()
    for info_i in subids:
        df10.loc[counti] = info_i
        counti=counti+1
    df10=df10.drop_duplicates()
    df10['sIDfreq']=df10.groupby('sID').transform('count') #add a column that shows the number of time this commenter sID appeared in the comment threads
    sidnumt=[]
    freqt=0
    freqset=0
    for freqt_i in np.arange(10):
        df11=df10[df10['sIDfreq']>=freqt_i]
        subidsf=len(df11.sID.drop_duplicates())
        sidnumt.append(subidsf)
        #print "Pulling only sIDs with freq >",freqt_i," reduces # of records from",len(df10.sID.drop_duplicates()),"to",subidsf
        if freqset==0:
            if float(sidnumt[freqt_i])/sidnumt[0]<=0.03 or subidsf<=45:
                freqset=1
                freqt=freqt_i
                df12=df10[df10['sIDfreq']>=freqt]
            print "Pulling only sIDs with freq >",freqt_i," reduces # of records from",sidnumt[0],"to",sidnumt[freqt_i],"ratio is",float(sidnumt[freqt_i])/sidnumt[0]
    plt.figure()
    h1=df10.sIDfreq.hist(color='mediumaquamarine')
    h2=plt.axvline(df10.sIDfreq.mean(), 0, 1,color='navy',linewidth=10)
    h3=plt.axvline(freqt, 0, 1,color='coral',linewidth=10)
    plt.title('Viewer engagement (comments per user)',fontsize=26)
    plt.xlabel("Number of comments per user",fontsize=30)
    plt.ylabel('Frequency',fontsize=30)
    plt.legend((h2,h3),('average engagement = '+str(round(df10.sIDfreq.mean(),2)),'engagement threshold = '+str(freqt)),fontsize=22)
    plt.savefig('./static/img/'+inputcID+'/'+inputcID+'_commenter_ID_freq_hist.png', dpi=300, format='png')

    plt.figure()
    h1=plt.plot(sidnumt,'o-',linewidth=10.0,color='steelblue',markersize=20)
    h2=plt.axvline(freqt,0,1, color='coral',linewidth=15)
    plt.legend((h1,h2),('','engagement threshold'),fontsize=22)
    plt.xlabel('Number of comments per user',fontsize=30)
    plt.ylabel("Thresholded user count",fontsize=30)
    plt.savefig('./static/img/'+inputcID+'/'+inputcID+"_commenterIds_thresholding.png", dpi=300, format='png')
    
    subids=set(df12.sID.values)
    print "After dropping duplicates and leaving only sIDs of freqt =", freqt," reduced # of records from ", sidnumt[0], "to", len(subids)
    nopull=0;
    subnumber=[]
    res_pulled=0;
    count_i=0
    sidstopull=[]
    numdownloads=1
    res_pulled=0
    con = mdb.connect(user= '******', passwd='X', db='X',unix_socket='X',charset='utf8')
    with con: #get all sIDs for which SubsfromComments have been already pulled from YT
        cur = con.cursor()
        cur.execute("SELECT distinct cID FROM CommenterSubs WHERE inputID=%s", (inputcID))
        sIDexist = np.array(cur.fetchall()).flatten()
        cur.close()
    if (len(subids) - len(sIDexist))>0: #pullagain==1:
        sidstopull= set(subids) - set(sIDexist)
        print "Have subscription info for", len(sIDexist)," gotta pull subscriptions for ", len(sidstopull),"more"
        for sID in sidstopull:
            print "Gotta pull subscriptions for sID", sID
            try:
                [sublist,subdates,nextp,rperpage,totr,resread]=yt.pull_subscriptions(sID,-1)
                subnumber.append(int(totr))
                res_pulled = resread+res_pulled
                sIDs = [sID for s in subdates]
                inputcIDs=[inputcID for s in subdates]
                reqtot = 1
                numdownloads=math.ceil(float(totr)/rperpage)
                dfi = pd.DataFrame({ 'inputID': pd.Series(inputcIDs),
                                    'DateAdded' : pd.Series(subdates),
                                    'cID' : pd.Series(sIDs),
                                    'tocID' : pd.Series(sublist),    
                                   })     
                dfi.to_sql('CommenterSubs', engine, if_exists='append', index=False)
                while reqtot < numdownloads:
                    [sublist,subdates,nextp,rperpage,totr,resread]=yt.pull_subscriptions(sID,nextp)
                    res_pulled = res_pulled + resread
                    reqtot = reqtot + 1
                    dfi = pd.DataFrame({ 'inputID': pd.Series(inputcIDs),
                                        'DateAdded' : pd.Series(subdates),
                                        'cID' : pd.Series(sIDs),
                                        'tocID' : pd.Series(sublist),    
                                       })     
                    dfi.to_sql('CommenterSubs', engine, if_exists='append', index=False)
            except:
                dfi = pd.DataFrame({'inputID': inputcID,
                                    'DateAdded' : "None",
                                    'cID' : sID,
                                    'tocID' : pd.Series("NotShared"),
                                   })
                dfi.to_sql('CommenterSubs', engine, if_exists='append', index=False)
                #print "Channel ", sID, 'does not share its subscriptions'
                nopull=nopull+1
                pass
        #plt.figure()
        #plt.hist(subnumber,color='sandybrown')
        #plt.title('Average number of subscriptions per user: %.d (std = %.d)' % (int(np.average(subnumber)), int(np.std(subnumber))),fontsize=30) #%08.2f (std = %.2f)
        #plt.xlabel('Number of channels',fontsize=22)
        #plt.ylabel('Frequency (normalized)',fontsize=22)
        #plt.savefig('./static/img/'+inputcID+'/'+inputcID+'_numsub_per_user.png', dpi=300, format='png')#,transparent=True)
    else:
        print "Congrats! Subscriptions for all Commenters are already in the database!"
    return subids