def gen_month_tweets(stdirname, mtdirname, tiw, trw, min_rtnum):
    """
    抽取发布后tiw时间内转发数超过minnumrt的原创微博及其trw时间内所有转发微博
    """
    tiw = datetime.timedelta(seconds=tiw)
    trw = datetime.timedelta(seconds=trw)

    for entry in os.listdir(stdirname):
        pte, ptl = get_pte_ptl(entry)
        mids = set()
        rtnums = defaultdict(int)
        stfilename = os.path.join(stdirname, entry)

        log('Start collecting mids from %s' % stfilename)
        with open(stfilename) as stfile:
            for line in stfile:
                fields = get_fields(line)
                if 'rtMid' not in fields:  # original tweet
                    time = datetime.datetime.strptime(fields['time'], '%Y-%m-%d %H:%M:%S')
                    if pte <= time < ptl:
                        mid = fields['mid']
                        mids.add(mid)
                else:  # retweet
                    rtmid = fields['rtMid']
                    if rtmid in mids:
                        rttime = datetime.datetime.strptime(fields['rtTime'], '%Y-%m-%d %H:%M:%S')
                        time = datetime.datetime.strptime(fields['time'], '%Y-%m-%d %H:%M:%S')
                        if time - rttime < tiw:
                            rtnums[rtmid] += 1

        # keep only the tweets that reach min_rtnum retweets within tiw
        rtnums = dict((mid, num) for mid, num in rtnums.iteritems()
                      if num >= min_rtnum)

        print '%d qualifying tweets, %d retweets in total' % (len(rtnums), sum(rtnums.values()))

        date = str(pte.date()).rsplit('-', 1)[0]  # 'YYYY-MM'
        file_pool = FilePool(500, 'a')
        log('Start generating month tweets and retweets from %s' % stfilename)
        with open(stfilename) as stfile:
            for line in stfile:
                fields = get_fields(line)
                if 'rtMid' not in fields:  # original tweet
                    mid = fields['mid']
                    if mid in rtnums:
                        mtfilename = os.path.join(mtdirname, date, mid)
                        file_pool.write(mtfilename, line)
                else:  # retweet
                    rtmid = fields['rtMid']
                    if rtmid in rtnums:
                        time = datetime.datetime.strptime(fields['time'], '%Y-%m-%d %H:%M:%S')
                        rttime = datetime.datetime.strptime(fields['rtTime'], '%Y-%m-%d %H:%M:%S')
                        if time - rttime < trw:
                            mtfilename = os.path.join(mtdirname, date, rtmid)
                            file_pool.write(mtfilename, line)
        file_pool.close()
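
# gen_month_tweets() streams into many per-cascade files through a FilePool
# helper that this page does not show (nor does it show get_fields,
# get_pte_ptl, or log). The sketch below is hypothetical: the usage above
# (FilePool(500, 'a'), .write(path, line), .close()) suggests a bounded LRU
# cache of open file handles that keeps the process under the OS
# file-descriptor limit; the original implementation may differ.
import os
from collections import OrderedDict

class FilePool(object):
    def __init__(self, max_open, mode):
        self.max_open = max_open
        self.mode = mode
        self.files = OrderedDict()  # path -> open file handle, in LRU order

    def write(self, path, data):
        handle = self.files.pop(path, None)
        if handle is None:
            if len(self.files) >= self.max_open:
                _, oldest = self.files.popitem(last=False)  # evict least recently used
                oldest.close()
            dirname = os.path.dirname(path)
            if dirname and not os.path.isdir(dirname):
                os.makedirs(dirname)  # gen_month_tweets writes into per-month subdirectories
            handle = open(path, self.mode)
        self.files[path] = handle  # (re)insert as most recently used
        handle.write(data)

    def close(self):
        for handle in self.files.itervalues():
            handle.close()
        self.files.clear()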

def gen_train_test_tweets_by_time(tdirname, trntdirname, tsttdirname, p):
    """
    p: proportion of tweets assigned to the training set (earliest tweets first)
    """

    mid_times = {}
    for entry in os.listdir(tdirname):
        with open(os.path.join(tdirname, entry)) as fd:
            firstline = fd.readline()
            time = get_fields(firstline)['time']
            mid_times[entry] = time

    mids_sorted_by_time = sorted(mid_times.keys(), key=lambda key: mid_times[key])
    num_mids = len(mid_times)
    num_trnmids = int(num_mids * p)
    trnmids = mids_sorted_by_time[: num_trnmids]
    tstmids = mids_sorted_by_time[num_trnmids:]

    for mid in trnmids:
        srcfilename = os.path.join(tdirname, mid)
        dstfilename = os.path.join(trntdirname, mid)
        shutil.copy(srcfilename, dstfilename)  # needs `import shutil`; safer than shelling out to cp

    for mid in tstmids:
        srcfilename = os.path.join(tdirname, mid)
        dstfilename = os.path.join(tsttdirname, mid)
        shutil.copy(srcfilename, dstfilename)
def gen_user_pop(tdirname, retfilename):
    pops = defaultdict(list)
    for entry in os.listdir(tdirname):
        if entry != '2011-07':  # skip the month reserved for training and testing
            subtdirname = os.path.join(tdirname, entry)
            log('Collecting data from %s' % subtdirname)
            for subentry in os.listdir(subtdirname):
                filename = os.path.join(subtdirname, subentry)
                with open(filename) as infile:
                    uids = [int(get_fields(line)['uid'].split('\t')[0].split('$')[0])
                            for line in infile]
                    srcuid = uids[0]
                    pops[srcuid].append(len(uids) - 1)

    log('Computing user popularity')
    items = []
    for uid in pops:
        pops_ = pops[uid]
        pop = sum(pops_) / float(len(pops_))
        items.append((uid, pop))
    items.sort(key=lambda item: item[1], reverse=True)

    log('Saving results')
    with open(retfilename, 'w') as retfile:
        for item in items:
            retfile.write('%d %.8f\n' % (item[0], item[1]))
def stat_rtnum_dist(tfilename, retfilename):
    """
    Distribution of retweet counts within 24 hours of posting.
    :param tfilename: suid tweet
    :param retfilename: output file
    :return:
    """
    mid2rtnum = defaultdict(int)
    rtnum2tnum = defaultdict(int)
    td = datetime.timedelta(seconds=24*60*60)

    log('Start collecting data from %s' % tfilename)
    with open(tfilename) as tfile:
        for line in tfile:
            fields = get_fields(line)
            if 'rtMid' not in fields:
                mid = fields['mid']
                mid2rtnum[mid] = 0
            else:
                rtmid = fields['rtMid']
                time = datetime.datetime.strptime(fields['time'], '%Y-%m-%d %H:%M:%S')
                rttime = datetime.datetime.strptime(fields['rtTime'], '%Y-%m-%d %H:%M:%S')
                if time - rttime < td:  # within 24 hours
                    mid2rtnum[rtmid] += 1

    for rtnum in mid2rtnum.itervalues():
        rtnum2tnum[rtnum] += 1

    items = sorted(rtnum2tnum.items(), key=lambda item: item[0])
    with open(retfilename, 'w') as retfile:
        for rtnum, tnum in items:
            retfile.write('%d\t%d\n' % (rtnum, tnum))
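
# The file written by stat_rtnum_dist() has two tab-separated columns
# (retweet count, number of tweets). A quick log-log view with matplotlib;
# a sketch only, the plotting details are not from the original source.
def plot_rtnum_dist(retfilename):
    import matplotlib.pyplot as plt
    xs, ys = [], []
    with open(retfilename) as retfile:
        for line in retfile:
            rtnum, tnum = line.split('\t')
            if int(rtnum) > 0:  # the zero-retweet bucket has no place on a log axis
                xs.append(int(rtnum))
                ys.append(int(tnum))
    plt.loglog(xs, ys, marker='.', linestyle='none')
    plt.xlabel('retweet count within 24 hours')
    plt.ylabel('number of tweets')
    plt.show()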
def stat_suid_tweets(tfilename, retfilename):
    """
    2011.7月原始微博数、转发微博数、用户数
    """
    n_ori_tweets = 0
    n_retweets = 0
    mids = set()
    uids = set()
    log('Start collecting data')
    with open(tfilename) as tfile:
        for line in tfile:
            fields = get_fields(line)
            uid = int(fields['uid'].split('\t')[0].split('$')[0])
            uids.add(uid)
            if 'rtMid' in fields:
                n_retweets += 1
                rtmid = fields['rtMid']
                if rtmid not in mids:
                    mids.add(rtmid)
                    n_ori_tweets += 1
                rtuid = int(fields['rtUid'].split('$')[0])
                uids.add(rtuid)
            else:
                n_ori_tweets += 1

    with open(retfilename, 'w') as retfile:
        retfile.write('Original tweets: %d\n' % n_ori_tweets)
        retfile.write('Retweets: %d\n' % n_retweets)
        retfile.write('Uids: %d\n' % len(uids))
def get_conf(request, layer_id):
    layer = Layer.objects.get(id=int(layer_id))
    datastore = Datastore.objects.get(id=layer.datastore_id)
    workspace = Workspace.objects.get(id=datastore.workspace_id)
    gs = geographic_servers.get_instance().get_server_by_id(workspace.server.id)
    index = utils.get_next_index(layer)
    (ds_type, resource) = gs.getResourceInfo(workspace.name, datastore, layer.name, "json")
    fields = utils.get_fields(resource)
    if layer.conf:
        new_fields = []
        conf = ast.literal_eval(layer.conf)
        for field in fields:
            if conf:
                for f in conf['fields']:
                    if f['name'] == field['name']:
                        for id, language in settings.LANGUAGES:
                            field['title-'+id] = f['title-'+id]
            else:
                for id, language in settings.LANGUAGES:
                    field['title-'+id] = field['name']
            new_fields.append(field)
        fields = new_fields
        
    feature_type = utils.get_feature_type(fields)
    alphanumeric_fields = utils.get_alphanumeric_fields(fields)
       
    supported_fonts_str = gs.getSupportedFonts()
    supported_fonts = json.loads(supported_fonts_str)
    sorted_fonts = utils.sortFontsArray(supported_fonts.get("fonts"))
              
    layer_url = core_utils.get_wms_url(workspace)
    layer_wfs_url = core_utils.get_wfs_url(workspace)
    
    preview_url = ''
    if feature_type == 'PointSymbolizer':
        preview_url = workspace.server.frontend_url + '/wms?REQUEST=GetLegendGraphic&VERSION=1.0.0&FORMAT=image/png&WIDTH=20&HEIGHT=20&LAYER=preview_point'    
    elif feature_type == 'LineSymbolizer':      
        preview_url = workspace.server.frontend_url + '/wms?REQUEST=GetLegendGraphic&VERSION=1.0.0&FORMAT=image/png&WIDTH=20&HEIGHT=20&LAYER=preview_line'     
    elif feature_type == 'PolygonSymbolizer': 
        preview_url = workspace.server.frontend_url + '/wms?REQUEST=GetLegendGraphic&VERSION=1.0.0&FORMAT=image/png&WIDTH=20&HEIGHT=20&LAYER=preview_polygon'
                  
    conf = {
        'featureType': feature_type,
        'fields': alphanumeric_fields,
        'json_alphanumeric_fields': json.dumps(alphanumeric_fields),
        'fonts': sorted_fonts,
        'layer_id': layer_id,
        'layer_url': layer_url,
        'layer_wfs_url': layer_wfs_url,
        'layer_name': workspace.name + ':' + layer.name,
        'style_name': workspace.name + '_' + layer.name + '_' + str(index),
        'libraries': Library.objects.all(),
        'supported_crs': json.dumps(core_utils.get_supported_crs()),
        'preview_url': preview_url
    }    
     
    return conf
def linechart_update(request, layer_id, chart_id):
    if request.method == 'POST':
        layer = Layer.objects.get(id=int(layer_id))
        chart = Chart.objects.get(id=int(chart_id))

        title = request.POST.get('title')
        description = request.POST.get('description')
        chart_conf = request.POST.get('chart_conf')

        chart.title = title
        chart.description = description
        chart.conf = chart_conf

        chart.save()

        return HttpResponse(json.dumps({'success': True}, indent=4),
                            content_type='application/json')

    else:
        layer = Layer.objects.get(id=int(layer_id))
        chart = Chart.objects.get(id=int(chart_id))
        datastore = Datastore.objects.get(id=layer.datastore_id)
        workspace = Workspace.objects.get(id=datastore.workspace_id)
        gs = geographic_servers.get_instance().get_server_by_id(
            workspace.server.id)

        (ds_type, resource) = gs.getResourceInfo(workspace.name, datastore,
                                                 layer.name, "json")
        fields = utils.get_fields(resource)
        numeric_fields = utils.get_numeric_fields(fields)
        alpha_numeric_fields = utils.get_alphanumeric_fields(fields)
        geom_fields = utils.get_geometry_fields(fields)

        conf = json.loads(chart.conf)

        y_axis_begin_at_zero = False
        if 'y_axis_begin_at_zero' in conf:
            y_axis_begin_at_zero = conf['y_axis_begin_at_zero']

        return render(
            request, 'linechart_update.html', {
                'layer_id': layer_id,
                'chart_id': chart_id,
                'fields': json.dumps(fields),
                'numeric_fields': json.dumps(numeric_fields),
                'alpha_numeric_fields': json.dumps(alpha_numeric_fields),
                'geom_fields': json.dumps(geom_fields),
                'title': chart.title,
                'description': chart.description,
                'dataset_type': conf['dataset_type'],
                'x_axis_title': conf['x_axis_title'],
                'y_axis_title': conf['y_axis_title'],
                'y_axis_begin_at_zero': y_axis_begin_at_zero,
                'geographic_names_column': conf['geographic_names_column'],
                'geometries_column': conf['geometries_column'],
                'selected_columns': json.dumps(conf['columns'])
            })
def stat_exposure_effect(unfilename, tfilename, retfilename):
    """
    1小时曝光量对24小时转发数的影响
    :param tfilename: suid tweet
    :return:
    """
    tiw = datetime.timedelta(seconds=60*60)
    trw = datetime.timedelta(seconds=24*60*60)
    mid2oritweet = defaultdict(lambda: [list(), 0])  # mid: [ti_uids, tr_rtnum]  ti时刻转发数大于等于1

    with open(tfilename) as tfile:
        for line in tfile:
            fields = get_fields(line)
            if 'rtMid' in fields:  # retweet
                rtmid = fields['rtMid']
                rttime = datetime.datetime.strptime(fields['rtTime'], '%Y-%m-%d %H:%M:%S')
                time = datetime.datetime.strptime(fields['time'], '%Y-%m-%d %H:%M:%S')
                if time - rttime < tiw:  # retweeted within [0, ti)
                    uids = [int(_.split('$')[0]) for _ in fields['uid'].split('\t')]
                    if rtmid in mid2oritweet:  # already added
                        mid2oritweet[rtmid][0].extend(uids)
                        mid2oritweet[rtmid][1] += 1
                    else:  # not added yet
                        rtuid = int(fields['rtUid'].split('$')[0])
                        uids.append(rtuid)
                        mid2oritweet[rtmid] = [uids, 1]
                elif time - rttime < trw:  # retweeted within [ti, tr)
                    if rtmid in mid2oritweet:  # already added
                        mid2oritweet[rtmid][1] += 1


    expnum_trrtnums = []

    log('Start reading user network')
    fin = snap.TFIn(unfilename)
    g = snap.TNGraph.Load(fin)

    for uids, tr_rtnum in mid2oritweet.itervalues():
        exp_nodes = set()
        rg, rrg, bg = get_rg_rrg_bg(uids, g)
        for ni in rg.Nodes():
            nid = ni.GetId()
            gni = g.GetNI(nid)
            for i in range(gni.GetInDeg()):
                nbrnid = gni.GetInNId(i)
                if not rg.IsNode(nbrnid):
                    exp_nodes.add(nbrnid)
        expnum_trrtnums.append((len(exp_nodes), tr_rtnum))

    expnum_trrtnums = sorted(set(expnum_trrtnums))  # dedupe, then sort by (expnum, tr_rtnum)
    with open(retfilename, 'w') as retfile:
        for expnum, tr_rtnum in expnum_trrtnums:
            retfile.write('%10d %10d\n' % (expnum, tr_rtnum))
def get_tweet_authors(tdirname):
    authors = defaultdict(int)
    for entry in os.listdir(tdirname):
        with open(os.path.join(tdirname, entry)) as tfile:
            firstline = tfile.readline()
            fields = get_fields(firstline)
            author = fields['uid']
            authors[author] += 1
    return authors
def stat_rtnum_curve(tfilename, retfilename):
    """
    归一化转发数曲线
    :param tfilename:
    :param retfilename:
    :return:
    """
    mid2rts = defaultdict(list)
    td = datetime.timedelta(seconds=72*60*60)

    log('Start collecting data from %s' % tfilename)
    with open(tfilename) as tfile:
        for line in tfile:
            fields = get_fields(line)
            if 'rtMid' not in fields:  # original tweet
                mid = fields['mid']
                time = datetime.datetime.strptime(fields['time'], '%Y-%m-%d %H:%M:%S')
                mid2rts[mid].append(time)
            else:
                rtmid = fields['rtMid']
                rttime = datetime.datetime.strptime(fields['rtTime'], '%Y-%m-%d %H:%M:%S')
                time = datetime.datetime.strptime(fields['time'], '%Y-%m-%d %H:%M:%S')
                if time - rttime < td:  # within 72 hours
                    if rtmid in mid2rts:
                        mid2rts[rtmid].append(time)
                    else:
                        mid2rts[rtmid] = [rttime, time]

    n_effective = 0
    normalized_curve = [0.0] * (72 * 60 + 1)
    for rts in mid2rts.itervalues():
        curve = [0.0] * (72 * 60 + 1)
        if len(rts) > 1:
            n_effective += 1
            for rt in rts[1:]:
                delta = rt - rts[0]
                minute = (delta.days * 24 * 60 * 60 + delta.seconds + 60) / 60  # minute bucket (Python 2 integer division)
                if minute <= 4320:
                    curve[minute] += 1
                else:
                    curve[-1] += 1
            for i in range(1, len(curve)):
                curve[i] += curve[i - 1]
            for i in range(len(curve)):
                curve[i] /= curve[-1]
                normalized_curve[i] += curve[i]

    for i in range(len(normalized_curve)):
        normalized_curve[i] /= n_effective

    with open(retfilename, 'w') as retfile:
        for p in normalized_curve:
            retfile.write('%.20f\n' % p)
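
# Sanity check of the minute-bucket arithmetic in stat_rtnum_curve(): with
# Python 2 integer division, (seconds + 60) / 60 equals ceil(seconds / 60.0)
# whenever the delay is not an exact multiple of a minute, so a retweet 90 s
# after the original lands in bucket 2 and one 3601 s after it in bucket 61.
assert (90 + 60) / 60 == 2
assert (3601 + 60) / 60 == 61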
def gen_user_network(suidfilename, rawunfilename, stdirname, m, unfilename):
    log('Start collecting short uids')
    suids = {}
    with open(suidfilename) as suidfile:
        for line in suidfile:
            uid, suid = line.strip().split()
            suids[uid] = int(suid)

    log('Start extracting user network from raw user network file')
    g = snap.TNGraph.New()
    with open(rawunfilename) as rawunfile:
        for line in rawunfile:
            follower_uid, followee_uid = line.strip().split()
            follower_suid = suids.get(follower_uid, -1)
            followee_suid = suids.get(followee_uid, -1)
            if follower_suid != -1 and not g.IsNode(follower_suid):
                g.AddNode(follower_suid)
            if followee_suid != -1 and not g.IsNode(followee_suid):
                g.AddNode(followee_suid)
            if follower_suid != -1 and followee_suid != -1 and follower_suid != followee_suid:  # avoid self-loops
                g.AddEdge(follower_suid, followee_suid)

    log('Start collecting user network from tweets')
    mention_nums = defaultdict(lambda: defaultdict(int))
    for entry in os.listdir(stdirname):
        tfilename = os.path.join(stdirname, entry)
        log('Start collecting user network from %s' % tfilename)
        with open(tfilename) as tfile:
            for line in tfile:
                fields = get_fields(line)
                for uids in [_.split('$') for _ in fields['uid'].split('\t')]:
                    uids = [int(_) for _ in uids]
                    for uid in uids[1:]:
                        mention_nums[uids[0]][uid] += 1
                if 'rtUid' in fields:
                    uids = fields['rtUid'].split('$')
                    uids = [int(_) for _ in uids]
                    for uid in uids[1:]:
                        mention_nums[uids[0]][uid] += 1

    for uid1 in mention_nums:
        for uid2, mention_num in mention_nums[uid1].items():
            if not g.IsNode(uid1):
                g.AddNode(uid1)
            if not g.IsNode(uid2):
                g.AddNode(uid2)
            if uid1 != uid2 and mention_num >= m:
                g.AddEdge(uid1, uid2)

    print '# Nodes is %d' % g.GetNodes()
    print '# Edges is %d' % g.GetEdges()

    log('Start writing user network')
    fout = snap.TFOut(unfilename)
    g.Save(fout)
    fout.Flush()
def process_sam_line(line):
    if not line.startswith(comment_character):
        fields = utils.get_fields(line, min_fields)
        if fields is None:
            return None
        chromosome = fields[2]
        position = int(fields[3])
        cigar_string = parse_cigar(fields[5])
        insertions = get_insertions(cigar_string, chromosome, position)
        deletions = get_deletions(cigar_string, chromosome, position)
        return insertions + deletions
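
# process_sam_line() relies on parse_cigar/get_insertions/get_deletions,
# which are defined elsewhere in the source module. For orientation only,
# a hypothetical parse_cigar that fits the call above, turning a CIGAR
# string such as '5M2I3D4M' into [(5, 'M'), (2, 'I'), (3, 'D'), (4, 'M')]:
import re

def parse_cigar(cigar):
    # each CIGAR operation is a run length followed by an opcode letter
    return [(int(length), op)
            for length, op in re.findall(r'(\d+)([MIDNSHP=X])', cigar)]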
def stat_mention_effect1(tfilename, retfilename):
    """
    转发链中微博平均@人数与24小时转发数的关系
    :param tfilename: suid tweet
    :return:
    """
    tiw = datetime.timedelta(seconds=60*60)
    trw = datetime.timedelta(seconds=24*60*60)
    mid2oritweet = {}  # mid: [ti_rtnum, tr_rtnum, ti_ori_n_mentions, ti_n_mentions]

    with open(tfilename) as tfile:
        for line in tfile:
            fields = get_fields(line)
            if 'rtMid' in fields:  # retweet
                rttime = datetime.datetime.strptime(fields['rtTime'], '%Y-%m-%d %H:%M:%S')
                time = datetime.datetime.strptime(fields['time'], '%Y-%m-%d %H:%M:%S')
                if time - rttime < tiw:  # retweeted within [0, ti)
                    rtmid = fields['rtMid']
                    if rtmid in mid2oritweet:  # already added
                        mid2oritweet[rtmid][0] += 1
                        mid2oritweet[rtmid][1] += 1
                        mid2oritweet[rtmid][3] += len(fields['uid'].split('\t')[0].split('$')) - 1
                    else:  # not added yet
                        ti_ori_n_mentions = len(fields['rtUid'].split('$')) - 1
                        ti_n_mentions = len(fields['uid'].split('\t')[0].split('$')) - 1 + ti_ori_n_mentions
                        mid2oritweet[rtmid] = [1, 1, ti_ori_n_mentions, ti_n_mentions]
                elif time - rttime < trw:  # retweeted within [ti, tr)
                    rtmid = fields['rtMid']
                    if rtmid in mid2oritweet:  # already added
                        mid2oritweet[rtmid][1] += 1
                    else:  # not added yet
                        ti_ori_n_mentions = len(fields['rtUid'].split('$')) - 1
                        mid2oritweet[rtmid] = [0, 1, ti_ori_n_mentions, 0]
            else:  # original tweet
                mid = fields['mid']
                mid2oritweet[mid] = [0, 0, len(fields['uid'].split('$')) - 1, 0]

    avg_n_mentions2tr_rtnums = defaultdict(list)
    for mid, (ti_rtnum, tr_rtnum, ti_ori_n_mentions, ti_n_mentions) in mid2oritweet.iteritems():
        ti_avg_n_mentions = ti_n_mentions / float(ti_rtnum + 1)
        avg_n_mentions2tr_rtnums[ti_avg_n_mentions].append(tr_rtnum)

    items = sorted(avg_n_mentions2tr_rtnums.items(), key=lambda item: item[0])
    with open(retfilename, 'w') as retfile:
        for ti_avg_n_mentions, tr_rtnums in items:
            avg_tr_rtnum = np.mean(tr_rtnums)
            retfile.write('%.4f %.4f\n' % (ti_avg_n_mentions, avg_tr_rtnum))
def stat_follower_effect(unfilename, tfilename, retfilename):
    """
    原作者粉丝数对24小时转发数的影响
    :param tfilename: suid tweet
    :return:
    """
    trw = datetime.timedelta(seconds=24*60*60)
    mid2oritweet = defaultdict(lambda: [0, 0])  # mid: [fo_num, tr_rtnum]

    log('Start reading user network')
    fin = snap.TFIn(unfilename)
    g = snap.TNGraph.Load(fin)

    with open(tfilename) as tfile:
        for line in tfile:
            fields = get_fields(line)
            if 'rtMid' in fields:  # retweet
                rtmid = fields['rtMid']
                rttime = datetime.datetime.strptime(fields['rtTime'], '%Y-%m-%d %H:%M:%S')
                time = datetime.datetime.strptime(fields['time'], '%Y-%m-%d %H:%M:%S')
                if time - rttime < trw:  # retweeted within [0, tr)
                    if rtmid in mid2oritweet:  # already added
                        mid2oritweet[rtmid][1] += 1
                    else:  # not added yet
                        rtuid = int(fields['rtUid'].split('$')[0])
                        if g.IsNode(rtuid):
                            fo_num = g.GetNI(rtuid).GetInDeg()
                        else:
                            fo_num = 0
                        mid2oritweet[rtmid] = [fo_num, 1]
            # original tweets are skipped: only tweets that are retweeted
            # within tr ever enter mid2oritweet

    fo_num_tr_rtnums = sorted(set(tuple(_) for _ in mid2oritweet.values()))  # dedupe, sort by (fo_num, tr_rtnum)

    with open(retfilename, 'w') as retfile:
        for fo_num, tr_rtnum in fo_num_tr_rtnums:
            retfile.write('%10d %10d\n' % (fo_num, tr_rtnum))
def stat_avg_tweet_num_per_hour(tfilename, retfilename, days):
    """
    Average number of tweets posted in each hour-of-day slot, averaged over `days` days.
    """
    pt2tnum = defaultdict(int)
    ts = 60 * 60  # slot width in seconds

    log('Start collecting data from %s' % tfilename)
    with open(tfilename) as tfile:
        for line in tfile:
            fields = get_fields(line)
            time = datetime.datetime.strptime(fields['time'], '%Y-%m-%d %H:%M:%S')
            secs = time.hour * 60 * 60 + time.minute * 60 + time.second
            pt = secs / ts  # hour-of-day bucket (Python 2 integer division)
            pt2tnum[pt] += 1

    with open(retfilename, 'w') as retfile:
        for pt in sorted(pt2tnum.keys()):
            avgtnum = pt2tnum[pt] / float(days)
            retfile.write('%d\t%.2f\n' % (pt, avgtnum))
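
# Quick check of the hour bucketing above: with Python 2 integer division,
# 14:35:10 maps to second-of-day 52510 and hence to bucket 14.
assert (14 * 60 * 60 + 35 * 60 + 10) / (60 * 60) == 14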
def stat_tirtnum_effect(tfilename, retfilename):
    """
    :param tfilename: suid tweet
    :return:
    """
    tiw = datetime.timedelta(seconds=60*60)
    trw = datetime.timedelta(seconds=24*60*60)
    mid2oritweet = {}  # mid: [ti_rtnum, tr_rtnum]

    with open(tfilename) as tfile:
        for line in tfile:
            fields = get_fields(line)
            if 'rtMid' in fields:  # retweet
                rttime = datetime.datetime.strptime(fields['rtTime'], '%Y-%m-%d %H:%M:%S')
                time = datetime.datetime.strptime(fields['time'], '%Y-%m-%d %H:%M:%S')
                if time - rttime < tiw:  # retweeted within [0, ti)
                    rtmid = fields['rtMid']
                    if rtmid in mid2oritweet:  # already added
                        mid2oritweet[rtmid][0] += 1
                        mid2oritweet[rtmid][1] += 1
                    else:  # not added yet
                        mid2oritweet[rtmid] = [1, 1]
                elif time - rttime < trw:  # retweeted within [ti, tr)
                    rtmid = fields['rtMid']
                    if rtmid in mid2oritweet:  # already added
                        mid2oritweet[rtmid][1] += 1
                    else:  # not added yet
                        mid2oritweet[rtmid] = [0, 1]
            else:  # original tweet
                mid = fields['mid']
                mid2oritweet[mid] = [0, 0]

    ti_rtnum2tr_rtnums = defaultdict(list)
    for mid, (ti_rtnum, tr_rtnum) in mid2oritweet.iteritems():
        ti_rtnum2tr_rtnums[ti_rtnum].append(tr_rtnum)

    items = sorted(ti_rtnum2tr_rtnums.items(), key=lambda item: item[0])
    with open(retfilename, 'w') as retfile:
        for ti_rtnum, tr_rtnums in items:
            avg_tr_rtnum = np.mean(tr_rtnums)
            retfile.write('%.4f %.4f\n' % (ti_rtnum, avg_tr_rtnum))
def stat_time_interval_effect2(tfilename, retfilename):
    """
    微博1小时内平均转发时间间隔与24小时转发数的关系
    :param tfilename: suid tweet
    :return:
    """
    tiw = datetime.timedelta(seconds=60*60)
    trw = datetime.timedelta(seconds=24*60*60)
    mid2oritweet = {}  # mid: [time_interval, tr_rtnum]

    with open(tfilename) as tfile:
        for line in tfile:
            fields = get_fields(line)
            if 'rtMid' in fields:  # retweet
                rttime = datetime.datetime.strptime(fields['rtTime'], '%Y-%m-%d %H:%M:%S')
                time = datetime.datetime.strptime(fields['time'], '%Y-%m-%d %H:%M:%S')
                td = time - rttime
                if td < tiw:
                    rtmid = fields['rtMid']
                    if rtmid in mid2oritweet:  # already added
                        mid2oritweet[rtmid][0] = min(mid2oritweet[rtmid][0], td.seconds)
                        mid2oritweet[rtmid][1] += 1
                    else:  # not added yet
                        td = td.seconds
                        mid2oritweet[rtmid] = [td, 1]
                elif td < trw:  # retweeted within [ti, tr)
                    rtmid = fields['rtMid']
                    if rtmid in mid2oritweet:  # already added
                        mid2oritweet[rtmid][1] += 1
            else:  # original tweets are not tracked; only tweets retweeted within ti appear
                pass

    deduped = set(tuple(value) for value in mid2oritweet.itervalues())
    items = sorted(deduped)  # sort by (delay, tr_rtnum)
    with open(retfilename, 'w') as retfile:
        for time_interval, tr_rtnum in items:
            retfile.write('%d %.4f\n' % (time_interval, tr_rtnum))
def piechart_add(request, layer_id):
    if request.method == 'POST':
        layer = Layer.objects.get(id=int(layer_id))

        title = request.POST.get('title')
        description = request.POST.get('description')
        chart_conf = request.POST.get('chart_conf')

        chart = Chart(layer=layer,
                      type='piechart',
                      title=title,
                      description=description,
                      conf=chart_conf)
        chart.save()

        return HttpResponse(json.dumps({'success': True}, indent=4),
                            content_type='application/json')

    else:
        layer = Layer.objects.get(id=int(layer_id))
        datastore = Datastore.objects.get(id=layer.datastore_id)
        workspace = Workspace.objects.get(id=datastore.workspace_id)
        gs = geographic_servers.get_instance().get_server_by_id(
            workspace.server.id)

        (ds_type, resource) = gs.getResourceInfo(workspace.name, datastore,
                                                 layer.name, "json")
        fields = utils.get_fields(resource)
        numeric_fields = utils.get_numeric_fields(fields)
        alpha_numeric_fields = utils.get_alphanumeric_fields(fields)
        geom_fields = utils.get_geometry_fields(fields)

        conf = {
            'layer_id': layer_id,
            'fields': json.dumps(fields),
            'numeric_fields': json.dumps(numeric_fields),
            'alpha_numeric_fields': json.dumps(alpha_numeric_fields),
            'geom_fields': json.dumps(geom_fields)
        }

        return render(request, 'piechart_add.html', conf)
def process_vcf_line(line):
    if not line.startswith(comment_character):
        fields = utils.get_fields(line, min_fields)
        if fields is None:
            return None
        chromosome = fields[0]
        position = int(fields[1])
        type = None
        length = None

        reference_bases = fields[3]
        alternative_bases = fields[4]
        if len(alternative_bases) > len(reference_bases):  # Insertion
            type = utils.insertion_type
            length = len(alternative_bases) - len(reference_bases)
        elif len(reference_bases) > len(alternative_bases):  # Deletion
            type = utils.deletion_type
            length = len(reference_bases) - len(alternative_bases)
        else:
            return None
        return [[chromosome, position, type, length]]  # for compatibility
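
# Worked example for process_vcf_line(), assuming the module-level
# comment_character is '#', min_fields is at most 8, and utils.get_fields
# splits the line into its tab-separated fields; the record itself is made
# up. ALT ('ACG') is two bases longer than REF ('A'), so the call reports a
# length-2 insertion at chr1:12345.
def _demo_process_vcf_line():
    line = 'chr1\t12345\t.\tA\tACG\t50\tPASS\t.'
    print process_vcf_line(line)  # [['chr1', 12345, utils.insertion_type, 2]]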
def stat_mention_effect2(tfilename, retfilename):
    """
    原始微博@人数与24小时转发数的关系
    :param tfilename: suid tweet
    :return:
    """
    tiw = datetime.timedelta(seconds=60*60)
    trw = datetime.timedelta(seconds=24*60*60)
    mid2oritweet = {}  # mid: [n_mentions, tr_rtnum]

    with open(tfilename) as tfile:
        for line in tfile:
            fields = get_fields(line)
            if 'rtMid' in fields:  # retweet
                rttime = datetime.datetime.strptime(fields['rtTime'], '%Y-%m-%d %H:%M:%S')
                time = datetime.datetime.strptime(fields['time'], '%Y-%m-%d %H:%M:%S')
                if time - rttime < trw:  # retweeted within [0, tr)
                    rtmid = fields['rtMid']
                    if rtmid in mid2oritweet:  # already added
                        mid2oritweet[rtmid][1] += 1
                    else:  # not added yet
                        n_mentions = len(fields['rtUid'].split('$')) - 1
                        mid2oritweet[rtmid] = [n_mentions, 1]
            else:  # original tweet
                mid = fields['mid']
                n_mentions = len(fields['uid'].split('$')) - 1
                mid2oritweet[mid] = [n_mentions, 0]

    deduped = set(tuple(value) for value in mid2oritweet.itervalues())
    items = sorted(deduped)  # sort by (n_mentions, tr_rtnum)
    with open(retfilename, 'w') as retfile:
        for n_mentions, tr_rtnum in items:
            retfile.write('%d %.4f\n' % (n_mentions, tr_rtnum))
def __getitem__(self, idx):
    # returns a sequence of n_lag values plus the single value right
    # after it (the prediction target)
    fields = get_fields(self.paths[idx], ['closing_prices', 'volume'])
    prices = torch.Tensor(fields['closing_prices'])
    volume = torch.Tensor(fields['volume'])
    n_values = fields['n_values']

    def get_io(n_lag):
        if n_values - n_lag < 1:
            return None, None, None, None
        np.random.seed()  # reseed from fresh OS entropy on every call
        start_idx = np.random.randint(0, n_values - n_lag)
        stop_idx = start_idx + n_lag
        return (prices[start_idx:stop_idx], prices[stop_idx],
                volume[start_idx:stop_idx], volume[stop_idx])

    last_prices, next_price, last_volumes, next_volume = get_io(self.n_lag)

    sample = {'last_prices': last_prices, 'next_price': next_price,
              'last_volumes': last_volumes, 'next_volume': next_volume}

    return sample
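
# The __getitem__ above reads like a torch.utils.data.Dataset method. A
# hypothetical skeleton of the class it could belong to; the names
# PriceWindowDataset, paths, and n_lag are assumptions, not taken from the
# original source.
import torch.utils.data

class PriceWindowDataset(torch.utils.data.Dataset):
    def __init__(self, paths, n_lag):
        self.paths = paths  # one serialized record per sample
        self.n_lag = n_lag  # window length fed to the model

    def __len__(self):
        return len(self.paths)

    # __getitem__ as defined above

# usage sketch:
# loader = torch.utils.data.DataLoader(PriceWindowDataset(paths, n_lag=30),
#                                      batch_size=64, shuffle=True)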
def get_conf(request, layer_id):
    layer = Layer.objects.get(id=int(layer_id))
    datastore = Datastore.objects.get(id=layer.datastore_id)
    workspace = Workspace.objects.get(id=datastore.workspace_id)

    index = len(StyleLayer.objects.filter(layer=layer))
    styleLayers = StyleLayer.objects.filter(layer=layer)
    for style_layer in styleLayers:
        aux_name = style_layer.style.name
        aux_name = aux_name.replace(workspace.name + '_' + layer.name + '_',
                                    '')

        try:
            aux_index = int(aux_name)
            if index < aux_index + 1:
                index = aux_index + 1
        except ValueError:
            print "Error getting index"

    (ds_type, resource) = mapservice.getResourceInfo(workspace.name, datastore,
                                                     layer.name, "json")
    fields = utils.get_fields(resource)
    if layer.conf:
        new_fields = []
        conf = ast.literal_eval(layer.conf)
        for field in fields:
            if conf:
                for f in conf['fields']:
                    if f['name'] == field['name']:
                        for id, language in settings.LANGUAGES:
                            field['title-' + id] = f['title-' + id]
            else:
                for id, language in settings.LANGUAGES:
                    field['title-' + id] = field['name']
            new_fields.append(field)
        fields = new_fields

    feature_type = utils.get_feature_type(fields)
    alphanumeric_fields = utils.get_alphanumeric_fields(fields)

    supported_fonts_str = mapservice.getSupportedFonts()
    supported_fonts = json.loads(supported_fonts_str)
    sorted_fonts = utils.sortFontsArray(supported_fonts.get("fonts"))

    layer_url = core_utils.get_wms_url(request, workspace)
    layer_wfs_url = core_utils.get_wfs_url(request, workspace)

    preview_url = ''
    if feature_type == 'PointSymbolizer':
        preview_url = settings.GVSIGOL_SERVICES[
            'URL'] + '/wms?REQUEST=GetLegendGraphic&VERSION=1.0.0&FORMAT=image/png&WIDTH=20&HEIGHT=20&LAYER=preview_point'
    elif feature_type == 'LineSymbolizer':
        preview_url = settings.GVSIGOL_SERVICES[
            'URL'] + '/wms?REQUEST=GetLegendGraphic&VERSION=1.0.0&FORMAT=image/png&WIDTH=20&HEIGHT=20&LAYER=preview_line'
    elif feature_type == 'PolygonSymbolizer':
        preview_url = settings.GVSIGOL_SERVICES[
            'URL'] + '/wms?REQUEST=GetLegendGraphic&VERSION=1.0.0&FORMAT=image/png&WIDTH=20&HEIGHT=20&LAYER=preview_polygon'

    conf = {
        'featureType': feature_type,
        'fields': alphanumeric_fields,
        'json_alphanumeric_fields': json.dumps(alphanumeric_fields),
        'fonts': sorted_fonts,
        'layer_id': layer_id,
        'layer_url': layer_url,
        'layer_wfs_url': layer_wfs_url,
        'layer_name': workspace.name + ':' + layer.name,
        'style_name': workspace.name + '_' + layer.name + '_' + str(index),
        'libraries': Library.objects.all(),
        'supported_crs': json.dumps(core_utils.get_supported_crs()),
        'preview_url': preview_url
    }

    return conf
def stat_link_pop(tfilename, retfilename):
    """
    :param tfilename: suid tweet
    :return:
    """
    tiw = datetime.timedelta(seconds=24 * 60 * 60)
    n_tweets = 0
    n_tweets_with_link = 0
    mid2oritweet = {}  # mid: [pop, with_link]

    with open(tfilename) as tfile:
        for line in tfile:
            fields = get_fields(line)
            if "rtMid" in fields:  # retweet
                rttime = datetime.datetime.strptime(fields["rtTime"], "%Y-%m-%d %H:%M:%S")
                time = datetime.datetime.strptime(fields["time"], "%Y-%m-%d %H:%M:%S")
                if time - rttime >= tiw:  # more than 24 hours after posting; skip
                    continue

                rtmid = fields["rtMid"]
                if rtmid in mid2oritweet:
                    mid2oritweet[rtmid][0] += 1
                else:
                    if fields["rtIsContainLink"] == "true":
                        mid2oritweet[rtmid] = [1, True]
                        n_tweets_with_link += 1
                    else:
                        mid2oritweet[rtmid] = [1, False]
                    n_tweets += 1

                if fields["isContainLink"] == "true":
                    n_tweets_with_link += 1

            else:  # original tweet
                mid = fields["mid"]
                if fields["isContainLink"] == "true":
                    mid2oritweet[mid] = [0, True]
                else:
                    mid2oritweet[mid] = [0, False]
            n_tweets += 1

    n_oritweets_without_link = 0
    n_oritweets_with_link = 0
    pops_oritweets_without_link = []
    pops_oritweets_with_link = []
    for mid, (pop, iscontainlink) in mid2oritweet.iteritems():
        if iscontainlink:
            n_oritweets_with_link += 1
            pops_oritweets_with_link.append(pop)
        else:
            n_oritweets_without_link += 1
            pops_oritweets_without_link.append(pop)
    mid_pop_oritweets_with_link = sorted(pops_oritweets_with_link)[len(pops_oritweets_with_link) / 2]  # median
    mid_pop_oritweets_without_link = sorted(pops_oritweets_without_link)[len(pops_oritweets_without_link) / 2]  # median

    with open(retfilename, "w") as retfile:
        retfile.write("n_tweets: %d\n" % n_tweets)
        retfile.write("n_tweets_with_link: %d\n" % n_tweets_with_link)
        retfile.write("n_original_tweets: %d\n" % (n_oritweets_with_link + n_oritweets_without_link))
        retfile.write("n_original_tweets_with_link: %d\n" % n_oritweets_with_link)
        retfile.write("n_original_tweets_without_link: %d\n" % n_oritweets_without_link)
        retfile.write("mid_pop_oritweets_with_link: %f\n" % mid_pop_oritweets_with_link)
        retfile.write("mid_pop_oritweets_without_link: %f\n" % mid_pop_oritweets_without_link)
def stat_url_effect(tfilename, retfilename):
    """
    包含链接和不包含链接的微博的被转发情况
    :param tfilename: suid tweet
    :return:
    """
    tiw = datetime.timedelta(seconds=60 * 60)
    trw = datetime.timedelta(seconds=24 * 60 * 60)
    mid2oritweet = {}  # mid: [ti_pop, tr_pop, ti_n_tweet_with_link, ti_ori_tweet_with_link]

    with open(tfilename) as tfile:
        for line in tfile:
            fields = get_fields(line)
            if "rtMid" in fields:  # retweet
                rttime = datetime.datetime.strptime(fields["rtTime"], "%Y-%m-%d %H:%M:%S")
                time = datetime.datetime.strptime(fields["time"], "%Y-%m-%d %H:%M:%S")
                td = time - rttime
                if td < tiw:  # retweeted within [0, ti)
                    rtmid = fields["rtMid"]
                    if rtmid in mid2oritweet:  # already added
                        mid2oritweet[rtmid][0] += 1
                        mid2oritweet[rtmid][1] += 1
                        if fields["isContainLink"] == "true":
                            mid2oritweet[rtmid][2] += 1
                    else:  # not added yet
                        if fields["rtIsContainLink"] == "true" and fields["isContainLink"] == "true":
                            mid2oritweet[rtmid] = [1, 1, 2, 1]
                        elif fields["rtIsContainLink"] == "true" and fields["isContainLink"] == "false":
                            mid2oritweet[rtmid] = [1, 1, 1, 1]
                        elif fields["rtIsContainLink"] == "false" and fields["isContainLink"] == "true":
                            mid2oritweet[rtmid] = [1, 1, 1, 0]
                        else:
                            mid2oritweet[rtmid] = [1, 1, 0, 0]
                elif td < trw:  # retweeted within [ti, tr)
                    rtmid = fields["rtMid"]
                    if rtmid in mid2oritweet:  # already added
                        mid2oritweet[rtmid][1] += 1
                    else:  # not added yet
                        if fields["rtIsContainLink"] == "true":
                            mid2oritweet[rtmid] = [0, 1, 1, 1]
                        else:
                            mid2oritweet[rtmid] = [0, 1, 0, 0]
                else:  # retweeted after tr
                    rtmid = fields["rtMid"]
                    if rtmid not in mid2oritweet:  # record the original even when first seen after tr
                        if fields["rtIsContainLink"] == "true":
                            mid2oritweet[rtmid] = [0, 0, 0, 1]
                        else:
                            mid2oritweet[rtmid] = [0, 0, 0, 0]
            else:  # original tweet
                mid = fields["mid"]
                if fields["isContainLink"] == "true":
                    mid2oritweet[mid] = [0, 0, 1, 1]
                else:
                    mid2oritweet[mid] = [0, 0, 0, 0]

    ratio2tr_pops = defaultdict(list)
    for mid, (ti_pop, tr_pop, ti_n_link, ti_ori_with_link) in mid2oritweet.iteritems():
        ratio = ti_n_link / float(ti_pop + 1)
        ratio2tr_pops[ratio].append(tr_pop)

    n_ori_tweets_with_link = sum([mid2oritweet[mid][3] for mid in mid2oritweet])
    n_ori_tweets_with_link_retweeted = sum([mid2oritweet[mid][3] for mid in mid2oritweet if mid2oritweet[mid][1] > 0])
    n_retweets_of_ori_tweets_with_link = sum(
        [mid2oritweet[mid][1] for mid in mid2oritweet if mid2oritweet[mid][3] == 1]
    )
    n_ori_tweets_without_link_retweeted = sum(
        [1 for mid in mid2oritweet if mid2oritweet[mid][1] > 0 and mid2oritweet[mid][3] == 0]
    )
    n_retweets_of_ori_tweets_without_link = sum(
        [mid2oritweet[mid][1] for mid in mid2oritweet if mid2oritweet[mid][3] == 0]
    )
    n_ori_tweets_without_link = len(mid2oritweet) - n_ori_tweets_with_link

    items = sorted(ratio2tr_pops.items(), key=lambda item: item[0])
    with open(retfilename, "w") as retfile:
        for ratio, tr_pops in items:
            avg_tr_pop = np.mean(tr_pops)
            retfile.write("%.4f    %.4f\n" % (ratio, avg_tr_pop))
        retfile.write("\n" + "=" * 20 + "\n")

        retfile.write("n_ori_tweets_with_link: %d\n" % n_ori_tweets_with_link)
        retfile.write("n_ori_tweets_with_link_retweeted: %d\n" % n_ori_tweets_with_link_retweeted)
        retfile.write("n_retweets_of_ori_tweets_with_link: %d\n" % n_retweets_of_ori_tweets_with_link)
        retfile.write(
            "retweet_ratio_for_ori_tweets_with_link: %.8f\n"
            % (float(n_ori_tweets_with_link_retweeted) / n_ori_tweets_with_link)
        )
        retfile.write(
            "avg_n_retweet_of_ori_tweets_with_link: %.8f\n"
            % (float(n_retweets_of_ori_tweets_with_link) / n_ori_tweets_with_link)
        )
        retfile.write("n_ori_tweets_without_link: %d\n" % n_ori_tweets_without_link)
        retfile.write("n_ori_tweets_without_link_retweeted: %d\n" % n_ori_tweets_without_link_retweeted)
        retfile.write("n_retweets_of_ori_tweets_without_link: %d\n" % n_retweets_of_ori_tweets_without_link)
        retfile.write(
            "retweet_ratio_for_ori_tweets_without_link: %.8f\n"
            % (float(n_ori_tweets_without_link_retweeted) / n_ori_tweets_without_link)
        )
        retfile.write(
            "avg_n_retweet_of_ori_tweets_without_link: %.8f\n"
            % (float(n_retweets_of_ori_tweets_without_link) / n_ori_tweets_without_link)
        )