def compore_distribution(field, feds, randoms, youngs):
    """Print one LaTeX table row comparing three samples and plot their PDFs.

    The row contains, in order: ``field``; items 2 and 3 of
    ``statis_util.comm_stat`` for ``feds``, ``randoms`` and ``youngs``
    (item 3 is printed inside a sigma annotation, so items 2/3 are
    presumably mean and standard deviation -- confirm in ``statis_util``);
    then, for each pairwise ``statis_util.ks_test``, item 2 of the result
    followed directly by ``pvalue(item 3)``.

    After printing, the three samples are drawn with
    ``plot.plot_pdf_mul_data`` and saved to ``field + '.pdf'``.

    Args:
        field: Text for the first table column; also the plot label and the
            base name of the saved PDF.
        feds: Sample plotted with the legend label 'ED'.
        randoms: Sample plotted with the legend label 'Random'.
        youngs: Sample plotted with the legend label 'Younger'.

    NOTE(review): this file also contains a later two-sample definition
    with the same name; if both live in one module, the later one replaces
    this one at import time.
    """
    ed_stat = statis_util.comm_stat(feds)
    rd_stat = statis_util.comm_stat(randoms)
    yg_stat = statis_util.comm_stat(youngs)

    # Pairwise tests; argument order is kept exactly as the callers expect.
    rd_vs_ed = statis_util.ks_test(randoms, feds)
    yg_vs_ed = statis_util.ks_test(youngs, feds)
    yg_vs_rd = statis_util.ks_test(youngs, randoms)

    # The backslash before "sigma" is escaped explicitly: a bare "\s" is an
    # invalid escape sequence in a non-raw literal.  The resulting string is
    # unchanged.  The trailing four backslashes emit the LaTeX row end.
    row_format = ('%s & %.2f($\\sigma$=%.2f) & %.2f($\\sigma$=%.2f)'
                  ' & %.2f($\\sigma$=%.2f) & %.2f%s & %.2f%s & %.2f%s \\\\')
    row_values = (
        field,
        ed_stat[2], ed_stat[3],
        rd_stat[2], rd_stat[3],
        yg_stat[2], yg_stat[3],
        rd_vs_ed[2], pvalue(rd_vs_ed[3]),
        yg_vs_ed[2], pvalue(yg_vs_ed[3]),
        yg_vs_rd[2], pvalue(yg_vs_rd[3]),
    )
    # Single parenthesised argument: same output under the Python 2 print
    # statement and the Python 3 print function.
    print(row_format % row_values)

    plot.plot_pdf_mul_data(
        [feds, randoms, youngs],
        field,
        ['--g', '--b', '--r'],
        ['s', 'o', '^'],
        ['ED', 'Random', 'Younger'],
        linear_bins=True,
        central=True,
        fit=False,
        fitranges=None,
        savefile=field + '.pdf',
    )
def profile_feature_stat():
    """Print LaTeX summary rows and plot PDFs for three profile count fields.

    For each of 'friends_count', 'followers_count' and 'statuses_count' the
    function:

    1. prints a separator line naming the field;
    2. loads the field for the 'fed', 'random' and 'young' sources via
       ``io.get_values_one_field(source, 'scom', field, {})`` and adds 1 to
       every value (presumably so zero counts survive the non-linear
       binning requested below -- confirm);
    3. prints one row per sample with items 0-3 of
       ``statis_util.comm_stat``, followed by a ``hline`` row;
    4. prints one row per sample pair with items 0-3 of
       ``statis_util.ks_test`` (labelled n_1, n_2, ks-value, p-value);
    5. plots the three samples with ``plot.plot_pdf_mul_data`` using the
       field's fit ranges and saves the figure to ``field + '.pdf'``.

    Takes no arguments and returns nothing; all output goes to stdout and
    to the saved figures.

    NOTE(review): an equivalent definition of this function appears again
    later in this file.
    """
    # 'favourites_count' is intentionally not in this list.
    fields = ['friends_count', 'followers_count', 'statuses_count']
    names = ['following', 'follower', 'tweet']
    query = {}  # empty filter document; named to avoid shadowing filter()
    # One entry per field; each entry holds one (low, high) range per sample.
    fitranges = [
        [(200, 100000), (1000, 100000000), (800, 10000000)],
        [(700, 10000), (800, 10000000), (800, 1000000)],
        [(800, 100000), (20000, 10000000), (10000, 10000000)],
    ]

    for field, name, ranges in zip(fields, names, fitranges):
        # Same text as printing the separator and the field as two items.
        print('===================== ' + field)

        feds = np.array(
            io.get_values_one_field('fed', 'scom', field, query)) + 1
        randoms = np.array(
            io.get_values_one_field('random', 'scom', field, query)) + 1
        youngs = np.array(
            io.get_values_one_field('young', 'scom', field, query)) + 1

        # Row prefixes are kept verbatim, including their uneven spacing,
        # so the emitted table text does not change.
        summary_rows = (
            ('ED & ', feds),
            ('Random &', randoms),
            ('Younger &', youngs),
        )
        for prefix, sample in summary_rows:
            comm = statis_util.comm_stat(sample)
            cells = ' & '.join(str(comm[k]) for k in range(4))
            print(prefix + cells + '\\\\')
        print('\\hline')

        ks_rows = (
            ('ks-test(Random, ED): & $n_1$: ', randoms, feds),
            ('ks-test(Younger, ED): & $n_1$: ', youngs, feds),
            ('ks-test(Younger, Random): & $n_1$: ', youngs, randoms),
        )
        for prefix, first, second in ks_rows:
            z = statis_util.ks_test(first, second)
            print(prefix + str(z[0]) + ' & $n_2$: ' + str(z[1])
                  + ' & ks-value: ' + str(z[2])
                  + ' & p-value: ' + str(z[3]) + '\\\\')

        plot.plot_pdf_mul_data(
            [feds, randoms, youngs],
            name,
            ['g', 'b', 'r'],
            ['s', 'o', '^'],
            ['ED', 'Random', 'Younger'],
            linear_bins=False,
            central=False,
            fit=True,
            fitranges=ranges,
            savefile=field + '.pdf',
        )
def profile_feature_stat():
    """Report summary statistics, KS tests and PDF plots for profile counts.

    Iterates over the fields 'friends_count', 'followers_count' and
    'statuses_count'.  For every field it reads the values of the 'fed',
    'random' and 'young' sources with
    ``io.get_values_one_field(source, 'scom', field, {})``, shifts them by
    +1 (presumably to keep zero counts valid for the non-linear bins used
    in the plot -- confirm), and then writes to stdout:

    * a separator line containing the field name;
    * three LaTeX rows ('ED', 'Random', 'Younger') holding items 0-3 of
      ``statis_util.comm_stat`` for the matching sample;
    * a ``hline`` row;
    * three LaTeX rows holding items 0-3 of ``statis_util.ks_test`` for the
      pairs (randoms, feds), (youngs, feds) and (youngs, randoms), labelled
      n_1, n_2, ks-value and p-value.

    Finally the three samples are plotted with ``plot.plot_pdf_mul_data``
    (fitting enabled, with the per-field fit ranges) and saved as
    ``field + '.pdf'``.

    The function has no parameters and no return value.

    NOTE(review): an equivalent definition of this function also appears
    earlier in this file; being later, this one is the definition that
    remains bound after import if both are in the same module.
    """
    # 'favourites_count' is intentionally left out.
    fields = ['friends_count', 'followers_count', 'statuses_count']
    names = ['following', 'follower', 'tweet']
    # Empty filter passed to the loader; not called "filter" so the builtin
    # stays usable inside this function.
    no_filter = {}
    # fitranges[k] belongs to fields[k]: one (low, high) pair per sample.
    fitranges = [
        [(200, 100000), (1000, 100000000), (800, 10000000)],
        [(700, 10000), (800, 10000000), (800, 1000000)],
        [(800, 100000), (20000, 10000000), (10000, 10000000)],
    ]
    row_end = '\\\\'  # two backslashes: the LaTeX end-of-row marker

    for field, plot_name, field_ranges in zip(fields, names, fitranges):
        # Output is identical to printing the bar and the field separated
        # by a single space.
        print('===================== ' + field)

        feds = np.array(
            io.get_values_one_field('fed', 'scom', field, no_filter)) + 1
        randoms = np.array(
            io.get_values_one_field('random', 'scom', field, no_filter)) + 1
        youngs = np.array(
            io.get_values_one_field('young', 'scom', field, no_filter)) + 1

        # The label text (including the missing space after '&' on two of
        # them) is reproduced exactly so the generated table is unchanged.
        for label, values in (('ED & ', feds),
                              ('Random &', randoms),
                              ('Younger &', youngs)):
            comm = statis_util.comm_stat(values)
            print(label + str(comm[0]) + ' & ' + str(comm[1])
                  + ' & ' + str(comm[2]) + ' & ' + str(comm[3]) + row_end)
        print('\\hline')

        for label, left, right in (
                ('ks-test(Random, ED): & $n_1$: ', randoms, feds),
                ('ks-test(Younger, ED): & $n_1$: ', youngs, feds),
                ('ks-test(Younger, Random): & $n_1$: ', youngs, randoms)):
            z = statis_util.ks_test(left, right)
            print(label + str(z[0]) + ' & $n_2$: ' + str(z[1])
                  + ' & ks-value: ' + str(z[2])
                  + ' & p-value: ' + str(z[3]) + row_end)

        plot.plot_pdf_mul_data(
            [feds, randoms, youngs],
            plot_name,
            ['g', 'b', 'r'],
            ['s', 'o', '^'],
            ['ED', 'Random', 'Younger'],
            linear_bins=False,
            central=False,
            fit=True,
            fitranges=field_ranges,
            savefile=field + '.pdf',
        )
def compore_distribution(field, feds, randoms, youngs=None):
    """Print a LaTeX row comparing two samples and save their density plot.

    The printed row contains ``field``, items 2 and 3 of
    ``statis_util.comm_stat`` for ``feds`` and for ``randoms`` (item 3 is
    shown inside a sigma annotation, so the pair is presumably mean and
    standard deviation -- confirm in ``statis_util``), and item 2 of
    ``statis_util.ks_test(randoms, feds)`` followed directly by
    ``pvalue(item 3)``.

    Both samples are then drawn with ``sns.distplot(..., hist=False)``,
    the figure is written to ``'data/' + field + '.pdf'`` and the current
    figure is cleared so the next call starts from an empty canvas.

    Args:
        field: Text for the first table column and base name of the PDF.
        feds: Sample drawn with the label 'Positive'.
        randoms: Sample drawn with the label 'Negative'.
        youngs: Not used by this version; kept (default ``None``) so calls
            that still pass a third sample keep working.

    Note:
        The 'data' directory is not created here; it must already exist.
    """
    ed_stat = statis_util.comm_stat(feds)
    rd_stat = statis_util.comm_stat(randoms)
    rd_vs_ed = statis_util.ks_test(randoms, feds)

    # "\\sigma" spells out the backslash that the former "\s" only produced
    # through an invalid escape sequence; the runtime string is identical.
    row_format = ('%s & %.2f($\\sigma$=%.2f) & %.2f($\\sigma$=%.2f)'
                  ' & %.2f%s \\\\')
    # Single parenthesised argument: prints the same text under both the
    # Python 2 print statement and the Python 3 print function.
    print(row_format % (field,
                        ed_stat[2], ed_stat[3],
                        rd_stat[2], rd_stat[3],
                        rd_vs_ed[2], pvalue(rd_vs_ed[3])))

    # NOTE(review): distplot is deprecated in seaborn >= 0.11; it is kept
    # so the rendered figure does not change.
    sns.distplot(feds, hist=False, label='Positive')
    sns.distplot(randoms, hist=False, label='Negative')
    plt.xlabel('value')
    plt.ylabel('PDF')
    plt.savefig('data/' + field + '.pdf')
    plt.clf()