def gen_month_tweets(stdirname, mtdirname, tiw, trw, min_rtnum):
    """
    Extract original tweets that receive more than min_rtnum retweets within
    tiw seconds of posting, together with all of their retweets within trw seconds.
    """
    tiw = datetime.timedelta(seconds=tiw)
    trw = datetime.timedelta(seconds=trw)
    for entry in os.listdir(stdirname):
        pte, ptl = get_pte_ptl(entry)
        mids = set()
        rtnums = defaultdict(int)
        stfilename = os.path.join(stdirname, entry)
        log('Start collecting mids from %s' % stfilename)
        with open(stfilename) as stfile:
            for line in stfile:
                fields = get_fields(line)
                if 'rtMid' not in fields:  # original tweet
                    time = datetime.datetime.strptime(fields['time'], '%Y-%m-%d %H:%M:%S')
                    if pte <= time < ptl:
                        mid = fields['mid']
                        mids.add(mid)
                else:  # retweet
                    rtmid = fields['rtMid']
                    if rtmid in mids:
                        rttime = datetime.datetime.strptime(fields['rtTime'], '%Y-%m-%d %H:%M:%S')
                        time = datetime.datetime.strptime(fields['time'], '%Y-%m-%d %H:%M:%S')
                        if time - rttime < tiw:
                            rtnums[rtmid] += 1
        for mid in rtnums.keys():  # keys() returns a copy in Python 2, so deleting while iterating is safe
            if rtnums[mid] < min_rtnum:
                del rtnums[mid]
        print len(rtnums), sum(rtnums.values())
        date = str(pte.date()).rsplit('-', 1)[0]
        file_pool = FilePool(500, 'a')
        log('Start generating month tweets and retweets from %s' % stfilename)
        with open(stfilename) as stfile:
            for line in stfile:
                fields = get_fields(line)
                if 'rtMid' not in fields:  # original tweet
                    mid = fields['mid']
                    if mid in rtnums:
                        mtfilename = os.path.join(mtdirname, date, mid)
                        file_pool.write(mtfilename, line)
                else:  # retweet
                    rtmid = fields['rtMid']
                    if rtmid in rtnums:
                        time = datetime.datetime.strptime(fields['time'], '%Y-%m-%d %H:%M:%S')
                        rttime = datetime.datetime.strptime(fields['rtTime'], '%Y-%m-%d %H:%M:%S')
                        if time - rttime < trw:
                            mtfilename = os.path.join(mtdirname, date, rtmid)
                            file_pool.write(mtfilename, line)
        file_pool.close()

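# FilePool is a project helper that is not defined in this file. The class
# below is only a plausible sketch inferred from how it is used above --
# FilePool(max_open, mode), .write(filename, line), .close(): it caps the
# number of simultaneously open handles with LRU eviction, and mode 'a'
# matters because an evicted file must append, not truncate, when reopened.
# It assumes the target directories already exist; the real implementation
# may differ.
from collections import OrderedDict


class _FilePoolSketch(object):

    def __init__(self, max_open, mode):
        self.max_open = max_open
        self.mode = mode  # 'a' so an evicted, reopened file appends
        self.files = OrderedDict()  # filename -> open handle, in LRU order

    def write(self, filename, line):
        fd = self.files.pop(filename, None)
        if fd is None:
            if len(self.files) >= self.max_open:
                _, oldest = self.files.popitem(last=False)  # evict the LRU handle
                oldest.close()
            fd = open(filename, self.mode)
        self.files[filename] = fd  # re-insert as most recently used
        fd.write(line)

    def close(self):
        for fd in self.files.itervalues():
            fd.close()
        self.files.clear()
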
def gen_train_test_tweets_by_time(tdirname, trntdirname, tsttdirname, p):
    """ p: proportion of tweets assigned to the training set """
    import shutil  # used instead of shelling out to cp
    mid_times = {}
    for entry in os.listdir(tdirname):
        with open(os.path.join(tdirname, entry)) as fd:
            firstline = fd.readline()
            time = get_fields(firstline)['time']
            mid_times[entry] = time
    mids_sorted_by_time = sorted(mid_times.keys(), key=lambda key: mid_times[key])
    num_mids = len(mid_times)
    num_trnmids = int(num_mids * p)
    trnmids = mids_sorted_by_time[:num_trnmids]
    tstmids = mids_sorted_by_time[num_trnmids:]
    for mid in trnmids:
        srcfilename = os.path.join(tdirname, mid)
        dstfilename = os.path.join(trntdirname, mid)
        shutil.copy(srcfilename, dstfilename)
    for mid in tstmids:
        srcfilename = os.path.join(tdirname, mid)
        dstfilename = os.path.join(tsttdirname, mid)
        shutil.copy(srcfilename, dstfilename)

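def _demo_train_test_split():
    # Hypothetical usage: an 80/20 time-ordered split. The directory paths
    # are illustrative, not taken from the project.
    gen_train_test_tweets_by_time('tweets/2011-07', 'tweets/2011-07-train',
                                  'tweets/2011-07-test', 0.8)
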
def gen_user_pop(tdirname, retfilename):
    pops = defaultdict(list)
    for entry in os.listdir(tdirname):
        if entry != '2011-07':  # skip the month reserved for training and testing
            subtdirname = os.path.join(tdirname, entry)
            log('Collecting data from %s' % subtdirname)
            for subentry in os.listdir(subtdirname):
                filename = os.path.join(subtdirname, subentry)
                with open(filename) as infile:
                    uids = [int(get_fields(line)['uid'].split('\t')[0].split('$')[0])
                            for line in infile]
                srcuid = uids[0]
                pops[srcuid].append(len(uids) - 1)  # retweet count = lines minus the original
    log('Computing user popularity')
    items = []
    for uid in pops:
        pops_ = pops[uid]
        pop = sum(pops_) / float(len(pops_))  # average retweet count per original tweet
        items.append((uid, pop))
    items.sort(key=lambda item: item[1], reverse=True)
    log('Saving results')
    with open(retfilename, 'w') as retfile:
        for item in items:
            retfile.write('%d %.8f\n' % (item[0], item[1]))

def stat_rtnum_dist(tfilename, retfilename):
    """
    Distribution of retweet counts within 24 hours.
    :param tfilename: suid tweet
    :param retfilename: output file
    :return:
    """
    mid2rtnum = defaultdict(int)
    rtnum2tnum = defaultdict(int)
    td = datetime.timedelta(seconds=24 * 60 * 60)
    log('Start collecting data from %s' % tfilename)
    with open(tfilename) as tfile:
        for line in tfile:
            fields = get_fields(line)
            if 'rtMid' not in fields:  # original tweet
                mid = fields['mid']
                mid2rtnum[mid] = 0
            else:  # retweet
                rtmid = fields['rtMid']
                time = datetime.datetime.strptime(fields['time'], '%Y-%m-%d %H:%M:%S')
                rttime = datetime.datetime.strptime(fields['rtTime'], '%Y-%m-%d %H:%M:%S')
                if time - rttime < td:  # within 24 hours
                    mid2rtnum[rtmid] += 1
    for rtnum in mid2rtnum.itervalues():
        rtnum2tnum[rtnum] += 1
    items = sorted(rtnum2tnum.items(), key=lambda item: item[0])
    with open(retfilename, 'w') as retfile:
        for rtnum, tnum in items:
            retfile.write('%d\t%d\n' % (rtnum, tnum))

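def _plot_rtnum_dist(retfilename):
    # Sketch: visualize the distribution written by stat_rtnum_dist on
    # log-log axes. Assumes matplotlib is installed; the zero-retweet bucket
    # is dropped so the log scale stays well defined.
    import matplotlib.pyplot as plt
    rtnums, tnums = [], []
    with open(retfilename) as retfile:
        for line in retfile:
            rtnum, tnum = line.split('\t')
            if int(rtnum) > 0:
                rtnums.append(int(rtnum))
                tnums.append(int(tnum))
    plt.loglog(rtnums, tnums, marker='.', linestyle='none')
    plt.xlabel('24-hour retweet count')
    plt.ylabel('number of tweets')
    plt.savefig(retfilename + '.png')
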
def stat_suid_tweets(tfilename, retfilename):
    """ Counts of original tweets, retweets, and users in July 2011 """
    n_ori_tweets = 0
    n_retweets = 0
    mids = set()
    uids = set()
    log('Start collecting data')
    with open(tfilename) as tfile:
        for line in tfile:
            fields = get_fields(line)
            uid = int(fields['uid'].split('\t')[0].split('$')[0])
            uids.add(uid)
            if 'rtMid' in fields:  # retweet
                n_retweets += 1
                rtmid = fields['rtMid']
                if rtmid not in mids:  # original tweet seen only through this retweet
                    mids.add(rtmid)
                    n_ori_tweets += 1
                    rtuid = int(fields['rtUid'].split('$')[0])
                    uids.add(rtuid)
            else:  # original tweet
                n_ori_tweets += 1
    with open(retfilename, 'w') as retfile:
        retfile.write('Original tweets: %d\n' % n_ori_tweets)
        retfile.write('Retweets: %d\n' % n_retweets)
        retfile.write('Uids: %d\n' % len(uids))

def get_conf(request, layer_id):
    layer = Layer.objects.get(id=int(layer_id))
    datastore = Datastore.objects.get(id=layer.datastore_id)
    workspace = Workspace.objects.get(id=datastore.workspace_id)
    gs = geographic_servers.get_instance().get_server_by_id(workspace.server.id)
    index = utils.get_next_index(layer)

    (ds_type, resource) = gs.getResourceInfo(workspace.name, datastore, layer.name, "json")
    fields = utils.get_fields(resource)
    if layer.conf:
        new_fields = []
        conf = ast.literal_eval(layer.conf)
        for field in fields:
            if conf:
                for f in conf['fields']:
                    if f['name'] == field['name']:
                        for id, language in settings.LANGUAGES:
                            field['title-' + id] = f['title-' + id]
            else:
                for id, language in settings.LANGUAGES:
                    field['title-' + id] = field['name']
            new_fields.append(field)
        fields = new_fields

    feature_type = utils.get_feature_type(fields)
    alphanumeric_fields = utils.get_alphanumeric_fields(fields)

    supported_fonts_str = gs.getSupportedFonts()
    supported_fonts = json.loads(supported_fonts_str)
    sorted_fonts = utils.sortFontsArray(supported_fonts.get("fonts"))

    layer_url = core_utils.get_wms_url(workspace)
    layer_wfs_url = core_utils.get_wfs_url(workspace)

    preview_url = ''
    base_legend_url = workspace.server.frontend_url + '/wms?REQUEST=GetLegendGraphic&VERSION=1.0.0&FORMAT=image/png&WIDTH=20&HEIGHT=20&LAYER='
    if feature_type == 'PointSymbolizer':
        preview_url = base_legend_url + 'preview_point'
    elif feature_type == 'LineSymbolizer':
        preview_url = base_legend_url + 'preview_line'
    elif feature_type == 'PolygonSymbolizer':
        preview_url = base_legend_url + 'preview_polygon'

    conf = {
        'featureType': feature_type,
        'fields': alphanumeric_fields,
        'json_alphanumeric_fields': json.dumps(alphanumeric_fields),
        'fonts': sorted_fonts,
        'layer_id': layer_id,
        'layer_url': layer_url,
        'layer_wfs_url': layer_wfs_url,
        'layer_name': workspace.name + ':' + layer.name,
        'style_name': workspace.name + '_' + layer.name + '_' + str(index),
        'libraries': Library.objects.all(),
        'supported_crs': json.dumps(core_utils.get_supported_crs()),
        'preview_url': preview_url
    }

    return conf

def linechart_update(request, layer_id, chart_id):
    if request.method == 'POST':
        layer = Layer.objects.get(id=int(layer_id))
        chart = Chart.objects.get(id=int(chart_id))
        title = request.POST.get('title')
        description = request.POST.get('description')
        chart_conf = request.POST.get('chart_conf')
        chart.title = title
        chart.description = description
        chart.conf = chart_conf
        chart.save()
        return HttpResponse(json.dumps({'success': True}, indent=4),
                            content_type='application/json')
    else:
        layer = Layer.objects.get(id=int(layer_id))
        chart = Chart.objects.get(id=int(chart_id))
        datastore = Datastore.objects.get(id=layer.datastore_id)
        workspace = Workspace.objects.get(id=datastore.workspace_id)
        gs = geographic_servers.get_instance().get_server_by_id(workspace.server.id)
        (ds_type, resource) = gs.getResourceInfo(workspace.name, datastore, layer.name, "json")
        fields = utils.get_fields(resource)
        numeric_fields = utils.get_numeric_fields(fields)
        alpha_numeric_fields = utils.get_alphanumeric_fields(fields)
        geom_fields = utils.get_geometry_fields(fields)
        conf = json.loads(chart.conf)
        y_axis_begin_at_zero = False
        if 'y_axis_begin_at_zero' in conf:
            y_axis_begin_at_zero = conf['y_axis_begin_at_zero']
        return render(
            request,
            'linechart_update.html', {
                'layer_id': layer_id,
                'chart_id': chart_id,
                'fields': json.dumps(fields),
                'numeric_fields': json.dumps(numeric_fields),
                'alpha_numeric_fields': json.dumps(alpha_numeric_fields),
                'geom_fields': json.dumps(geom_fields),
                'title': chart.title,
                'description': chart.description,
                'dataset_type': conf['dataset_type'],
                'x_axis_title': conf['x_axis_title'],
                'y_axis_title': conf['y_axis_title'],
                'y_axis_begin_at_zero': y_axis_begin_at_zero,
                'geographic_names_column': conf['geographic_names_column'],
                'geometries_column': conf['geometries_column'],
                'selected_columns': json.dumps(conf['columns'])
            })

def stat_exposure_effect(unfilename, tfilename, retfilename):
    """
    Effect of the 1-hour exposure count on the 24-hour retweet count.
    :param tfilename: suid tweet
    :return:
    """
    tiw = datetime.timedelta(seconds=60 * 60)
    trw = datetime.timedelta(seconds=24 * 60 * 60)
    # mid: [ti_uids, tr_rtnum]; only tweets with at least one retweet by ti are tracked
    mid2oritweet = defaultdict(lambda: [list(), 0])
    with open(tfilename) as tfile:
        for line in tfile:
            fields = get_fields(line)
            if 'rtMid' in fields:  # retweet
                rtmid = fields['rtMid']
                rttime = datetime.datetime.strptime(fields['rtTime'], '%Y-%m-%d %H:%M:%S')
                time = datetime.datetime.strptime(fields['time'], '%Y-%m-%d %H:%M:%S')
                if time - rttime < tiw:  # retweeted within (0, ti]
                    uids = [int(_.split('$')[0]) for _ in fields['uid'].split('\t')]
                    if rtmid in mid2oritweet:  # already added
                        mid2oritweet[rtmid][0].extend(uids)
                        mid2oritweet[rtmid][1] += 1
                    else:  # not added yet
                        rtuid = int(fields['rtUid'].split('$')[0])
                        uids.append(rtuid)
                        mid2oritweet[rtmid] = [uids, 1]
                elif time - rttime < trw:  # retweeted within (ti, tr]
                    if rtmid in mid2oritweet:  # already added
                        mid2oritweet[rtmid][1] += 1
    expnum_trrtnums = []
    log('Start reading user network')
    fin = snap.TFIn(unfilename)
    g = snap.TNGraph.Load(fin)
    for uids, tr_rtnum in mid2oritweet.itervalues():
        exp_nodes = set()
        rg, rrg, bg = get_rg_rrg_bg(uids, g)
        for ni in rg.Nodes():
            nid = ni.GetId()
            gni = g.GetNI(nid)
            for i in range(gni.GetInDeg()):
                nbrnid = gni.GetInNId(i)
                if not rg.IsNode(nbrnid):  # follower outside the retweet graph counts as exposed
                    exp_nodes.add(nbrnid)
        expnum_trrtnums.append((len(exp_nodes), tr_rtnum))
    expnum_trrtnums = set(expnum_trrtnums)
    expnum_trrtnums = sorted(expnum_trrtnums, cmp=lambda x, y: (x[0] - y[0]) or (x[1] - y[1]))
    with open(retfilename, 'w') as retfile:
        for expnum, tr_rtnum in expnum_trrtnums:
            retfile.write('%10d %10d\n' % (expnum, tr_rtnum))

def get_tweet_authors(tdirname):
    authors = defaultdict(int)
    for entry in os.listdir(tdirname):
        with open(os.path.join(tdirname, entry)) as tfile:
            firstline = tfile.readline()
            fields = get_fields(firstline)
            author = fields['uid']
            authors[author] += 1
    return authors

def stat_rtnum_curve(tfilename, retfilename):
    """
    Normalized retweet-count curve.
    :param tfilename:
    :param retfilename:
    :return:
    """
    mid2rts = defaultdict(list)
    td = datetime.timedelta(seconds=72 * 60 * 60)
    log('Start collecting data from %s' % tfilename)
    with open(tfilename) as tfile:
        for line in tfile:
            fields = get_fields(line)
            if 'rtMid' not in fields:  # original tweet
                mid = fields['mid']
                time = datetime.datetime.strptime(fields['time'], '%Y-%m-%d %H:%M:%S')
                mid2rts[mid].append(time)
            else:  # retweet
                rtmid = fields['rtMid']
                rttime = datetime.datetime.strptime(fields['rtTime'], '%Y-%m-%d %H:%M:%S')
                time = datetime.datetime.strptime(fields['time'], '%Y-%m-%d %H:%M:%S')
                if time - rttime < td:  # within 72 hours
                    if rtmid in mid2rts:
                        mid2rts[rtmid].append(time)
                    else:  # original not seen yet: record its post time first
                        mid2rts[rtmid] = [rttime, time]
    n_effective = 0
    normalized_curve = [0.0] * (72 * 60 + 1)
    for rts in mid2rts.itervalues():
        curve = [0.0] * (72 * 60 + 1)
        if len(rts) > 1:  # has at least one retweet
            n_effective += 1
            for rt in rts[1:]:
                delay = rt - rts[0]
                minute = (delay.days * 24 * 60 * 60 + delay.seconds + 60) / 60
                if minute <= 4320:
                    curve[minute] += 1
                else:
                    curve[-1] += 1  # fold overflow into the last bucket
            for i in range(1, len(curve)):
                curve[i] += curve[i - 1]  # cumulative retweet count
            for i in range(len(curve)):
                curve[i] /= curve[-1]  # normalize by the 72-hour total
                normalized_curve[i] += curve[i]
    for i in range(len(normalized_curve)):
        normalized_curve[i] /= n_effective
    with open(retfilename, 'w') as retfile:
        for p in normalized_curve:
            retfile.write('%.20f\n' % p)

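def _demo_minute_bucket():
    # Sketch of the minute-bucket arithmetic in stat_rtnum_curve: a delay of
    # 0-59 s lands in bucket 1, 60-119 s in bucket 2, and so on; any index
    # past 4320 (the 72-hour horizon) is folded into the final slot.
    def minute_bucket(delay):
        # Python 2 integer division
        return (delay.days * 24 * 60 * 60 + delay.seconds + 60) / 60

    assert minute_bucket(datetime.timedelta(seconds=0)) == 1
    assert minute_bucket(datetime.timedelta(seconds=59)) == 1
    assert minute_bucket(datetime.timedelta(seconds=60)) == 2
    assert minute_bucket(datetime.timedelta(hours=72)) == 4321  # overflow slot
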
def gen_user_network(suidfilename, rawunfilename, stdirname, m, unfilename):
    log('Start collecting short uids')
    suids = {}
    with open(suidfilename) as suidfile:
        for line in suidfile:
            uid, suid = line.strip().split()
            suids[uid] = int(suid)
    log('Start extracting user network from raw user network file')
    g = snap.TNGraph.New()
    with open(rawunfilename) as rawunfile:
        for line in rawunfile:
            follower_uid, followee_uid = line.strip().split()
            follower_suid = suids.get(follower_uid, -1)
            followee_suid = suids.get(followee_uid, -1)
            if follower_suid != -1 and not g.IsNode(follower_suid):
                g.AddNode(follower_suid)
            if followee_suid != -1 and not g.IsNode(followee_suid):
                g.AddNode(followee_suid)
            if follower_suid != -1 and followee_suid != -1 and follower_suid != followee_suid:
                # the last condition avoids creating self-loops
                g.AddEdge(follower_suid, followee_suid)
    log('Start collecting user network from tweets')
    mention_nums = defaultdict(lambda: defaultdict(int))
    for entry in os.listdir(stdirname):
        tfilename = os.path.join(stdirname, entry)
        log('Start collecting user network from %s' % tfilename)
        with open(tfilename) as tfile:
            for line in tfile:
                fields = get_fields(line)
                for uids in [_.split('$') for _ in fields['uid'].split('\t')]:
                    uids = [int(_) for _ in uids]
                    for uid in uids[1:]:
                        mention_nums[uids[0]][uid] += 1
                if 'rtUid' in fields:
                    uids = fields['rtUid'].split('$')
                    uids = [int(_) for _ in uids]
                    for uid in uids[1:]:
                        mention_nums[uids[0]][uid] += 1
    for uid1 in mention_nums:
        for uid2, mention_num in mention_nums[uid1].items():
            if not g.IsNode(uid1):
                g.AddNode(uid1)
            if not g.IsNode(uid2):
                g.AddNode(uid2)
            if uid1 != uid2 and mention_num >= m:  # only keep mention edges above the threshold m
                g.AddEdge(uid1, uid2)
    print '# Nodes is %d' % g.GetNodes()
    print '# Edges is %d' % g.GetEdges()
    log('Start writing user network')
    fout = snap.TFOut(unfilename)
    g.Save(fout)
    fout.Flush()

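def _demo_load_user_network(unfilename):
    # Reading back the binary graph written by gen_user_network; snap.TFIn
    # and snap.TNGraph.Load are the standard Snap.py load calls, mirroring
    # how stat_follower_effect and stat_exposure_effect open the graph.
    fin = snap.TFIn(unfilename)
    g = snap.TNGraph.Load(fin)
    print '# Nodes is %d' % g.GetNodes()
    print '# Edges is %d' % g.GetEdges()
    return g
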
def process_sam_line(line):
    if not line.startswith(comment_character):
        fields = utils.get_fields(line, min_fields)
        if fields is None:
            return None
        chromosome = fields[2]  # RNAME column
        position = int(fields[3])  # 1-based POS column
        cigar_string = parse_cigar(fields[5])  # CIGAR column
        insertions = get_insertions(cigar_string, chromosome, position)
        deletions = get_deletions(cigar_string, chromosome, position)
        return insertions + deletions

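# parse_cigar, get_insertions and get_deletions are defined elsewhere in this
# module. Below is only a plausible sketch of the tokenization parse_cigar
# presumably performs (an assumption, not the real implementation).
import re

_cigar_pattern = re.compile(r'(\d+)([MIDNSHP=X])')


def _parse_cigar_sketch(cigar):
    """'3M1I2D' -> [(3, 'M'), (1, 'I'), (2, 'D')]; '*' means unavailable."""
    if cigar == '*':
        return []
    return [(int(n), op) for n, op in _cigar_pattern.findall(cigar)]
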
def stat_mention_effect1(tfilename, retfilename):
    """
    Relation between the average number of @-mentions per tweet in the
    retweet chain and the 24-hour retweet count.
    :param tfilename: suid tweet
    :return:
    """
    tiw = datetime.timedelta(seconds=60 * 60)
    trw = datetime.timedelta(seconds=24 * 60 * 60)
    mid2oritweet = {}  # mid: [ti_rtnum, tr_rtnum, ti_ori_n_mentions, ti_n_mentions]
    with open(tfilename) as tfile:
        for line in tfile:
            fields = get_fields(line)
            if 'rtMid' in fields:  # retweet
                rttime = datetime.datetime.strptime(fields['rtTime'], '%Y-%m-%d %H:%M:%S')
                time = datetime.datetime.strptime(fields['time'], '%Y-%m-%d %H:%M:%S')
                if time - rttime < tiw:  # retweeted within (0, ti]
                    rtmid = fields['rtMid']
                    if rtmid in mid2oritweet:  # already added
                        mid2oritweet[rtmid][0] += 1
                        mid2oritweet[rtmid][1] += 1
                        mid2oritweet[rtmid][3] += len(fields['uid'].split('\t')[0].split('$')) - 1
                    else:  # not added yet
                        ti_ori_n_mentions = len(fields['rtUid'].split('$')) - 1
                        ti_n_mentions = len(fields['uid'].split('\t')[0].split('$')) - 1 + ti_ori_n_mentions
                        mid2oritweet[rtmid] = [1, 1, ti_ori_n_mentions, ti_n_mentions]
                elif time - rttime < trw:  # retweeted within (ti, tr]
                    rtmid = fields['rtMid']
                    if rtmid in mid2oritweet:  # already added
                        mid2oritweet[rtmid][1] += 1
                    else:  # not added yet
                        ti_ori_n_mentions = len(fields['rtUid'].split('$')) - 1
                        mid2oritweet[rtmid] = [0, 1, ti_ori_n_mentions, 0]
            else:  # original tweet
                mid = fields['mid']
                mid2oritweet[mid] = [0, 0, len(fields['uid'].split('$')) - 1, 0]
    avg_n_mentions2tr_rtnums = defaultdict(list)
    for mid, (ti_rtnum, tr_rtnum, ti_ori_n_mentions, ti_n_mentions) in mid2oritweet.iteritems():
        ti_avg_n_mentions = ti_n_mentions / float(ti_rtnum + 1)
        avg_n_mentions2tr_rtnums[ti_avg_n_mentions].append(tr_rtnum)
    items = sorted(avg_n_mentions2tr_rtnums.items(), key=lambda item: item[0])
    with open(retfilename, 'w') as retfile:
        for ti_avg_n_mentions, tr_rtnums in items:
            avg_tr_rtnum = np.mean(tr_rtnums)
            retfile.write('%.4f %.4f\n' % (ti_avg_n_mentions, avg_tr_rtnum))

def stat_follower_effect(unfilename, tfilename, retfilename):
    """
    Effect of the original author's follower count on the 24-hour retweet count.
    :param tfilename: suid tweet
    :return:
    """
    trw = datetime.timedelta(seconds=24 * 60 * 60)
    mid2oritweet = defaultdict(lambda: [0, 0])  # mid: [fo_num, tr_rtnum]
    log('Start reading user network')
    fin = snap.TFIn(unfilename)
    g = snap.TNGraph.Load(fin)
    with open(tfilename) as tfile:
        for line in tfile:
            fields = get_fields(line)
            if 'rtMid' in fields:  # retweet
                rtmid = fields['rtMid']
                rttime = datetime.datetime.strptime(fields['rtTime'], '%Y-%m-%d %H:%M:%S')
                time = datetime.datetime.strptime(fields['time'], '%Y-%m-%d %H:%M:%S')
                if time - rttime < trw:  # retweeted within (0, tr]
                    if rtmid in mid2oritweet:  # already added
                        mid2oritweet[rtmid][1] += 1
                    else:  # not added yet
                        rtuid = int(fields['rtUid'].split('$')[0])
                        if g.IsNode(rtuid):
                            fo_num = g.GetNI(rtuid).GetInDeg()  # in-degree = follower count
                        else:
                            fo_num = 0
                        mid2oritweet[rtmid] = [fo_num, 1]
            # else:  # original tweet
            #     mid = fields['mid']
            #     uid = int(fields['uid'].split('$')[0])
            #     fo_num = g.GetNI(uid).GetInDeg()
            #     mid2oritweet[mid] = [fo_num, 0]
    fo_num_tr_rtnums = list(set([tuple(_) for _ in mid2oritweet.values()]))
    fo_num_tr_rtnums.sort(cmp=lambda x, y: (x[0] - y[0]) or (x[1] - y[1]))
    with open(retfilename, 'w') as retfile:
        for fo_num, tr_rtnum in fo_num_tr_rtnums:
            retfile.write('%10d %10d\n' % (fo_num, tr_rtnum))

def stat_avg_tweet_num_per_hour(tfilename, retfilename, days):
    pt2tnum = defaultdict(int)
    ts = 60 * 60  # one-hour buckets
    log('Start collecting data from %s' % tfilename)
    with open(tfilename) as tfile:
        for line in tfile:
            fields = get_fields(line)
            time = datetime.datetime.strptime(fields['time'], '%Y-%m-%d %H:%M:%S')
            secs = time.hour * 60 * 60 + time.minute * 60 + time.second
            pt = secs / ts  # hour of day, 0-23
            pt2tnum[pt] += 1
    with open(retfilename, 'w') as retfile:
        for pt in sorted(pt2tnum.keys()):
            avgtnum = pt2tnum[pt] / float(days)
            retfile.write('%d\t%.2f\n' % (pt, avgtnum))

def stat_tirtnum_effect(tfilename, retfilename):
    """
    Relation between the retweet count within the first hour (ti) and the
    retweet count within 24 hours (tr).
    :param tfilename: suid tweet
    :return:
    """
    tiw = datetime.timedelta(seconds=60 * 60)
    trw = datetime.timedelta(seconds=24 * 60 * 60)
    mid2oritweet = {}  # mid: [ti_rtnum, tr_rtnum]
    with open(tfilename) as tfile:
        for line in tfile:
            fields = get_fields(line)
            if 'rtMid' in fields:  # retweet
                rttime = datetime.datetime.strptime(fields['rtTime'], '%Y-%m-%d %H:%M:%S')
                time = datetime.datetime.strptime(fields['time'], '%Y-%m-%d %H:%M:%S')
                if time - rttime < tiw:  # retweeted within (0, ti]
                    rtmid = fields['rtMid']
                    if rtmid in mid2oritweet:  # already added
                        mid2oritweet[rtmid][0] += 1
                        mid2oritweet[rtmid][1] += 1
                    else:  # not added yet
                        mid2oritweet[rtmid] = [1, 1]
                elif time - rttime < trw:  # retweeted within (ti, tr]
                    rtmid = fields['rtMid']
                    if rtmid in mid2oritweet:  # already added
                        mid2oritweet[rtmid][1] += 1
                    else:  # not added yet
                        mid2oritweet[rtmid] = [0, 1]
            else:  # original tweet
                mid = fields['mid']
                mid2oritweet[mid] = [0, 0]
    ti_rtnum2tr_rtnums = defaultdict(list)
    for mid, (ti_rtnum, tr_rtnum) in mid2oritweet.iteritems():
        ti_rtnum2tr_rtnums[ti_rtnum].append(tr_rtnum)
    items = sorted(ti_rtnum2tr_rtnums.items(), key=lambda item: item[0])
    with open(retfilename, 'w') as retfile:
        for ti_rtnum, tr_rtnums in items:
            avg_tr_rtnum = np.mean(tr_rtnums)
            retfile.write('%.4f %.4f\n' % (ti_rtnum, avg_tr_rtnum))

def stat_time_interval_effect2(tfilename, retfilename):
    """
    Relation between a tweet's minimum retweet delay within 1 hour and its
    24-hour retweet count.
    :param tfilename: suid tweet
    :return:
    """
    tiw = datetime.timedelta(seconds=60 * 60)
    trw = datetime.timedelta(seconds=24 * 60 * 60)
    mid2oritweet = {}  # mid: [time_interval, tr_rtnum]
    with open(tfilename) as tfile:
        for line in tfile:
            fields = get_fields(line)
            if 'rtMid' in fields:  # retweet
                rttime = datetime.datetime.strptime(fields['rtTime'], '%Y-%m-%d %H:%M:%S')
                time = datetime.datetime.strptime(fields['time'], '%Y-%m-%d %H:%M:%S')
                td = time - rttime
                if td < tiw:  # retweeted within (0, ti]
                    rtmid = fields['rtMid']
                    if rtmid in mid2oritweet:  # already added; keep the smallest delay
                        mid2oritweet[rtmid][0] = min(mid2oritweet[rtmid][0], td.seconds)
                        mid2oritweet[rtmid][1] += 1
                    else:  # not added yet
                        mid2oritweet[rtmid] = [td.seconds, 1]
                elif td < trw:  # retweeted within (ti, tr]
                    rtmid = fields['rtMid']
                    if rtmid in mid2oritweet:  # already added
                        mid2oritweet[rtmid][1] += 1
            else:  # original tweet
                pass
    deduped = set()
    for mid, value in mid2oritweet.iteritems():
        deduped.add(tuple(value))
    items = sorted(deduped, cmp=lambda x, y: (x[0] - y[0]) or (x[1] - y[1]))
    with open(retfilename, 'w') as retfile:
        for time_interval, tr_rtnum in items:
            retfile.write('%d %.4f\n' % (time_interval, tr_rtnum))

def piechart_add(request, layer_id):
    if request.method == 'POST':
        layer = Layer.objects.get(id=int(layer_id))
        title = request.POST.get('title')
        description = request.POST.get('description')
        chart_conf = request.POST.get('chart_conf')
        chart = Chart(layer=layer, type='piechart', title=title,
                      description=description, conf=chart_conf)
        chart.save()
        return HttpResponse(json.dumps({'success': True}, indent=4),
                            content_type='application/json')
    else:
        layer = Layer.objects.get(id=int(layer_id))
        datastore = Datastore.objects.get(id=layer.datastore_id)
        workspace = Workspace.objects.get(id=datastore.workspace_id)
        gs = geographic_servers.get_instance().get_server_by_id(workspace.server.id)
        (ds_type, resource) = gs.getResourceInfo(workspace.name, datastore, layer.name, "json")
        fields = utils.get_fields(resource)
        numeric_fields = utils.get_numeric_fields(fields)
        alpha_numeric_fields = utils.get_alphanumeric_fields(fields)
        geom_fields = utils.get_geometry_fields(fields)
        conf = {
            'layer_id': layer_id,
            'fields': json.dumps(fields),
            'numeric_fields': json.dumps(numeric_fields),
            'alpha_numeric_fields': json.dumps(alpha_numeric_fields),
            'geom_fields': json.dumps(geom_fields)
        }
        return render(request, 'piechart_add.html', conf)

def process_vcf_line(line):
    if not line.startswith(comment_character):
        fields = utils.get_fields(line, min_fields)
        if fields is None:
            return None
        chromosome = fields[0]  # CHROM column
        position = int(fields[1])  # POS column
        type = None
        length = None
        reference_bases = fields[3]  # REF column
        alternative_bases = fields[4]  # ALT column
        if len(alternative_bases) > len(reference_bases):  # insertion
            type = utils.insertion_type
            length = len(alternative_bases) - len(reference_bases)
        elif len(reference_bases) > len(alternative_bases):  # deletion
            type = utils.deletion_type
            length = len(reference_bases) - len(alternative_bases)
        else:
            return None
        return [[chromosome, position, type, length]]  # list of lists, for compatibility

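def _demo_process_vcf_line():
    # Hypothetical walk-through, assuming utils.get_fields splits the record
    # on whitespace and min_fields <= 5: a two-base REF against a one-base
    # ALT is reported as a deletion of length 1.
    line = 'chr1\t10177\t.\tAC\tA\t.\tPASS\t.'
    print process_vcf_line(line)  # expected: [['chr1', 10177, utils.deletion_type, 1]]
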
def stat_mention_effect2(tfilename, retfilename):
    """
    Relation between the number of @-mentions in the original tweet and the
    24-hour retweet count.
    :param tfilename: suid tweet
    :return:
    """
    trw = datetime.timedelta(seconds=24 * 60 * 60)
    mid2oritweet = {}  # mid: [n_mentions, tr_rtnum]
    with open(tfilename) as tfile:
        for line in tfile:
            fields = get_fields(line)
            if 'rtMid' in fields:  # retweet
                rttime = datetime.datetime.strptime(fields['rtTime'], '%Y-%m-%d %H:%M:%S')
                time = datetime.datetime.strptime(fields['time'], '%Y-%m-%d %H:%M:%S')
                if time - rttime < trw:  # retweeted within (0, tr]
                    rtmid = fields['rtMid']
                    if rtmid in mid2oritweet:  # already added
                        mid2oritweet[rtmid][1] += 1
                    else:  # not added yet
                        n_mentions = len(fields['rtUid'].split('$')) - 1
                        mid2oritweet[rtmid] = [n_mentions, 1]
            else:  # original tweet
                mid = fields['mid']
                n_mentions = len(fields['uid'].split('$')) - 1
                mid2oritweet[mid] = [n_mentions, 0]
    deduped = set()
    for mid, value in mid2oritweet.iteritems():
        deduped.add(tuple(value))
    items = sorted(deduped, cmp=lambda x, y: (x[0] - y[0]) or (x[1] - y[1]))
    with open(retfilename, 'w') as retfile:
        for n_mentions, tr_rtnum in items:
            retfile.write('%d %.4f\n' % (n_mentions, tr_rtnum))

def __getitem__(self, idx):
    # Returns a sequence of n_lag values and the single value after it
    # (the prediction target).
    fields = get_fields(self.paths[idx], ['closing_prices', 'volume'])
    prices = torch.Tensor(fields['closing_prices'])
    volume = torch.Tensor(fields['volume'])
    n_values = fields['n_values']

    def get_io(n_lag):
        if n_values - n_lag < 1:
            return None, None, None, None
        np.random.seed()  # reseed (e.g. so forked DataLoader workers don't share RNG state)
        # start_idx = 100 * np.random.randint(2)  # TODO test only
        start_idx = np.random.randint(0, n_values - n_lag)
        stop_idx = start_idx + n_lag
        return prices[start_idx:stop_idx], prices[stop_idx], \
               volume[start_idx:stop_idx], volume[stop_idx]

    last_prices, next_price, last_volumes, next_volume = get_io(self.n_lag)
    sample = {'last_prices': last_prices, 'next_price': next_price,
              'last_volumes': last_volumes, 'next_volume': next_volume}
    return sample

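# Hedged usage sketch for the enclosing Dataset (assumed to expose `paths`
# and `n_lag`); the names below are illustrative:
#
#     from torch.utils.data import DataLoader
#     loader = DataLoader(dataset, batch_size=32, shuffle=True, num_workers=4)
#     batch = next(iter(loader))
#     batch['last_prices'].shape  # -> (32, n_lag)
#     batch['next_price'].shape   # -> (32,)
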
def get_conf(request, layer_id):
    layer = Layer.objects.get(id=int(layer_id))
    datastore = Datastore.objects.get(id=layer.datastore_id)
    workspace = Workspace.objects.get(id=datastore.workspace_id)
    index = len(StyleLayer.objects.filter(layer=layer))
    styleLayers = StyleLayer.objects.filter(layer=layer)
    for style_layer in styleLayers:
        aux_name = style_layer.style.name
        aux_name = aux_name.replace(workspace.name + '_' + layer.name + '_', '')
        try:
            aux_index = int(aux_name)
            if index < aux_index + 1:
                index = aux_index + 1
        except ValueError:
            print "Error getting index"

    (ds_type, resource) = mapservice.getResourceInfo(workspace.name, datastore, layer.name, "json")
    fields = utils.get_fields(resource)
    if layer.conf:
        new_fields = []
        conf = ast.literal_eval(layer.conf)
        for field in fields:
            if conf:
                for f in conf['fields']:
                    if f['name'] == field['name']:
                        for id, language in settings.LANGUAGES:
                            field['title-' + id] = f['title-' + id]
            else:
                for id, language in settings.LANGUAGES:
                    field['title-' + id] = field['name']
            new_fields.append(field)
        fields = new_fields

    feature_type = utils.get_feature_type(fields)
    alphanumeric_fields = utils.get_alphanumeric_fields(fields)

    supported_fonts_str = mapservice.getSupportedFonts()
    supported_fonts = json.loads(supported_fonts_str)
    sorted_fonts = utils.sortFontsArray(supported_fonts.get("fonts"))

    layer_url = core_utils.get_wms_url(request, workspace)
    layer_wfs_url = core_utils.get_wfs_url(request, workspace)

    preview_url = ''
    base_legend_url = settings.GVSIGOL_SERVICES['URL'] + '/wms?REQUEST=GetLegendGraphic&VERSION=1.0.0&FORMAT=image/png&WIDTH=20&HEIGHT=20&LAYER='
    if feature_type == 'PointSymbolizer':
        preview_url = base_legend_url + 'preview_point'
    elif feature_type == 'LineSymbolizer':
        preview_url = base_legend_url + 'preview_line'
    elif feature_type == 'PolygonSymbolizer':
        preview_url = base_legend_url + 'preview_polygon'

    conf = {
        'featureType': feature_type,
        'fields': alphanumeric_fields,
        'json_alphanumeric_fields': json.dumps(alphanumeric_fields),
        'fonts': sorted_fonts,
        'layer_id': layer_id,
        'layer_url': layer_url,
        'layer_wfs_url': layer_wfs_url,
        'layer_name': workspace.name + ':' + layer.name,
        'style_name': workspace.name + '_' + layer.name + '_' + str(index),
        'libraries': Library.objects.all(),
        'supported_crs': json.dumps(core_utils.get_supported_crs()),
        'preview_url': preview_url
    }

    return conf

def stat_link_pop(tfilename, retfilename):
    """
    Retweet popularity of tweets with and without links (24-hour window).
    :param tfilename: suid tweet
    :return:
    """
    tiw = datetime.timedelta(seconds=24 * 60 * 60)
    n_tweets = 0
    n_tweets_with_link = 0
    mid2oritweet = {}  # mid: [pop, with_link]
    with open(tfilename) as tfile:
        for line in tfile:
            fields = get_fields(line)
            if "rtMid" in fields:  # retweet
                rttime = datetime.datetime.strptime(fields["rtTime"], "%Y-%m-%d %H:%M:%S")
                time = datetime.datetime.strptime(fields["time"], "%Y-%m-%d %H:%M:%S")
                if time - rttime >= tiw:  # retweeted more than 24 hours later; skip
                    continue
                rtmid = fields["rtMid"]
                if rtmid in mid2oritweet:
                    mid2oritweet[rtmid][0] += 1
                else:  # register the original through its first observed retweet
                    if fields["rtIsContainLink"] == "true":
                        mid2oritweet[rtmid] = [1, True]
                        n_tweets_with_link += 1
                    else:
                        mid2oritweet[rtmid] = [1, False]
                n_tweets += 1
                if fields["isContainLink"] == "true":
                    n_tweets_with_link += 1
            else:  # original tweet
                mid = fields["mid"]
                if fields["isContainLink"] == "true":
                    mid2oritweet[mid] = [0, True]
                    n_tweets_with_link += 1  # count the original's own link
                else:
                    mid2oritweet[mid] = [0, False]
                n_tweets += 1
    n_oritweets_without_link = 0
    n_oritweets_with_link = 0
    pops_oritweets_without_link = []
    pops_oritweets_with_link = []
    for mid, (pop, iscontainlink) in mid2oritweet.iteritems():
        if iscontainlink:
            n_oritweets_with_link += 1
            pops_oritweets_with_link.append(pop)
        else:
            n_oritweets_without_link += 1
            pops_oritweets_without_link.append(pop)
    # median popularity of each group
    mid_pop_oritweets_with_link = sorted(pops_oritweets_with_link)[len(pops_oritweets_with_link) / 2]
    mid_pop_oritweets_without_link = sorted(pops_oritweets_without_link)[len(pops_oritweets_without_link) / 2]
    with open(retfilename, "w") as retfile:
        retfile.write("n_tweets: %d\n" % n_tweets)
        retfile.write("n_tweets_with_link: %d\n" % n_tweets_with_link)
        retfile.write("n_original_tweets: %d\n" % (n_oritweets_with_link + n_oritweets_without_link))
        retfile.write("n_original_tweets_with_link: %d\n" % n_oritweets_with_link)
        retfile.write("n_original_tweets_without_link: %d\n" % n_oritweets_without_link)
        retfile.write("mid_pop_oritweets_with_link: %f\n" % mid_pop_oritweets_with_link)
        retfile.write("mid_pop_oritweets_without_link: %f\n" % mid_pop_oritweets_without_link)

def stat_url_effect(tfilename, retfilename):
    """
    How tweets with and without links get retweeted.
    :param tfilename: suid tweet
    :return:
    """
    tiw = datetime.timedelta(seconds=60 * 60)
    trw = datetime.timedelta(seconds=24 * 60 * 60)
    # mid: [ti_pop, tr_pop, ti_n_tweet_with_link, ti_ori_tweet_with_link]
    mid2oritweet = {}
    with open(tfilename) as tfile:
        for line in tfile:
            fields = get_fields(line)
            if "rtMid" in fields:  # retweet
                rttime = datetime.datetime.strptime(fields["rtTime"], "%Y-%m-%d %H:%M:%S")
                time = datetime.datetime.strptime(fields["time"], "%Y-%m-%d %H:%M:%S")
                td = time - rttime
                if td < tiw:  # retweeted within (0, ti]
                    rtmid = fields["rtMid"]
                    if rtmid in mid2oritweet:  # already added
                        mid2oritweet[rtmid][0] += 1
                        mid2oritweet[rtmid][1] += 1
                        if fields["isContainLink"] == "true":
                            mid2oritweet[rtmid][2] += 1
                    else:  # not added yet
                        if fields["rtIsContainLink"] == "true" and fields["isContainLink"] == "true":
                            mid2oritweet[rtmid] = [1, 1, 2, 1]
                        elif fields["rtIsContainLink"] == "true" and fields["isContainLink"] == "false":
                            mid2oritweet[rtmid] = [1, 1, 1, 1]
                        elif fields["rtIsContainLink"] == "false" and fields["isContainLink"] == "true":
                            mid2oritweet[rtmid] = [1, 1, 1, 0]
                        else:
                            mid2oritweet[rtmid] = [1, 1, 0, 0]
                elif td < trw:  # retweeted within (ti, tr]
                    rtmid = fields["rtMid"]
                    if rtmid in mid2oritweet:  # already added
                        mid2oritweet[rtmid][1] += 1
                    else:  # not added yet
                        if fields["rtIsContainLink"] == "true":
                            mid2oritweet[rtmid] = [0, 1, 1, 1]
                        else:
                            mid2oritweet[rtmid] = [0, 1, 0, 0]
                else:  # retweeted after tr
                    rtmid = fields["rtMid"]
                    if rtmid not in mid2oritweet:  # not added yet
                        if fields["rtIsContainLink"] == "true":
                            mid2oritweet[rtmid] = [0, 0, 0, 1]
                        else:
                            mid2oritweet[rtmid] = [0, 0, 0, 0]
            else:  # original tweet
                mid = fields["mid"]
                if fields["isContainLink"] == "true":
                    mid2oritweet[mid] = [0, 0, 1, 1]
                else:
                    mid2oritweet[mid] = [0, 0, 0, 0]
    ratio2tr_pops = defaultdict(list)
    for mid, (ti_pop, tr_pop, ti_n_link, ti_ori_with_link) in mid2oritweet.iteritems():
        ratio = ti_n_link / float(ti_pop + 1)  # share of link-bearing tweets in the first hour
        ratio2tr_pops[ratio].append(tr_pop)
    n_ori_tweets_with_link = sum([mid2oritweet[mid][3] for mid in mid2oritweet])
    n_ori_tweets_with_link_retweeted = sum(
        [mid2oritweet[mid][3] for mid in mid2oritweet if mid2oritweet[mid][1] > 0])
    n_retweets_of_ori_tweets_with_link = sum(
        [mid2oritweet[mid][1] for mid in mid2oritweet if mid2oritweet[mid][3] == 1])
    n_ori_tweets_without_link_retweeted = sum(
        [1 for mid in mid2oritweet if mid2oritweet[mid][1] > 0 and mid2oritweet[mid][3] == 0])
    n_retweets_of_ori_tweets_without_link = sum(
        [mid2oritweet[mid][1] for mid in mid2oritweet if mid2oritweet[mid][3] == 0])
    n_ori_tweets_without_link = len(mid2oritweet) - n_ori_tweets_with_link
    items = sorted(ratio2tr_pops.items(), key=lambda item: item[0])
    with open(retfilename, "w") as retfile:
        for ratio, tr_pops in items:
            avg_tr_pop = np.mean(tr_pops)
            retfile.write("%.4f %.4f\n" % (ratio, avg_tr_pop))
        retfile.write("\n" + "=" * 20 + "\n")
        retfile.write("n_ori_tweets_with_link: %d\n" % n_ori_tweets_with_link)
        retfile.write("n_ori_tweets_with_link_retweeted: %d\n" % n_ori_tweets_with_link_retweeted)
        retfile.write("n_retweets_of_ori_tweets_with_link: %d\n" % n_retweets_of_ori_tweets_with_link)
        retfile.write("retweet_ratio_for_ori_tweets_with_link: %.8f\n"
                      % (float(n_ori_tweets_with_link_retweeted) / n_ori_tweets_with_link))
        retfile.write("avg_n_retweet_of_ori_tweets_with_link: %.8f\n"
                      % (float(n_retweets_of_ori_tweets_with_link) / n_ori_tweets_with_link))
        retfile.write("n_ori_tweets_without_link: %d\n" % n_ori_tweets_without_link)
        retfile.write("n_ori_tweets_without_link_retweeted: %d\n" % n_ori_tweets_without_link_retweeted)
        retfile.write("n_retweets_of_ori_tweets_without_link: %d\n" % n_retweets_of_ori_tweets_without_link)
        retfile.write("retweet_ratio_for_ori_tweets_without_link: %.8f\n"
                      % (float(n_ori_tweets_without_link_retweeted) / n_ori_tweets_without_link))
        retfile.write("avg_n_retweet_of_ori_tweets_without_link: %.8f\n"
                      % (float(n_retweets_of_ori_tweets_without_link) / n_ori_tweets_without_link))