예제 #1
0
파일: behavior.py 프로젝트: hitalex/crawler
def get_behavior_statics(uid_list, topic_dict):
    """Collect per-user activity statistics for the given users.

    For every uid in ``uid_list`` an entry ``[num_topics, num_comments, text]``
    is built:

    * ``num_topics`` counts the topics in ``topic_dict`` created by the user;
    * ``num_comments`` counts the user's comments that belong to ``GROUP_ID``,
      sit on a topic present in ``topic_dict`` and were published inside the
      training window;
    * ``text`` accumulates the title and content of every such topic, plus
      the text of the comment itself (quoted comments are not included).

    While scanning ``COMMENT_ALL_FILE_PATH``, the comments of this group that
    fall in the training or test window are also copied to
    ``COMMENT_FILE_PATH`` -- but only when that file does not exist yet, so
    the extraction happens at most once.

    Args:
        uid_list: iterable of user ids to track.
        topic_dict: mapping topic id -> topic dict providing at least the
            keys ``'user_id'``, ``'title'`` and ``'content'``.

    Returns:
        dict mapping uid -> ``[num_topics, num_comments, text]``.
    """
    print('获取用户的行为信息...')
    # uid -> [num_topics, num_comments, related-content]
    behavior = dict()
    for uid in uid_list:
        behavior[uid] = [0, 0, '']

    # Walk over every topic and credit it to its creator.
    print('浏览所有的topic信息...')
    for topic_id in topic_dict:
        topic = topic_dict[topic_id]
        uid = topic['user_id']
        if uid in behavior:
            behavior[uid][0] += 1
            behavior[uid][2] += (topic['title'] + ' ' + topic['content'])

    print('浏览所有的留言信息...')
    f = open(COMMENT_ALL_FILE_PATH, 'r')
    fc = None
    try:
        # Extract this group's comments only once: skip if already on disk.
        if not os.path.exists(COMMENT_FILE_PATH):
            fc = open(COMMENT_FILE_PATH, 'w')

        row = ''
        for line in f:
            line = line.strip()
            if line != '[*ROWEND*]':
                # A record may span several physical lines; glue them together.
                row += line
                continue

            # A complete record has been read; reset the buffer right away.
            record = row
            row = ''

            seg_list = record.split('[=]')
            pubdate = datetime.strptime(seg_list[4], "%Y-%m-%d %H:%M:%S")
            topic_id = seg_list[2]
            # Keep only comments of this group whose topic has been loaded.
            if seg_list[1] != GROUP_ID or topic_id not in topic_dict:
                continue

            in_train = is_between(pubdate, TRAIN_START_DATE, TRAIN_END_DATE)

            if fc is not None and (
                    in_train or
                    is_between(pubdate, TEST_START_DATE, TEST_END_DATE)):
                fc.write(record + '\n[*ROWEND*]\n')

            if not in_train:
                continue

            uid = seg_list[3]
            if uid in behavior:
                behavior[uid][1] += 1
                # Commenting on a topic is taken as interest in it, so the
                # topic's title and content join the user's text together
                # with the comment itself.
                topic = topic_dict[topic_id]
                behavior[uid][2] += (topic['title'] + ' ' + topic['content'] +
                                     ' ' + seg_list[6])
    finally:
        # Release both handles even when a malformed record raises.
        f.close()
        if fc is not None:
            fc.close()

    return behavior
예제 #2
0
def score_inheritance(val: Optional[Union[int, float]]) -> str:
    """Translate an inheritance metric value into a colour name.

    Returns ``'white'`` for a missing value, ``'green'`` for the 0-3 band,
    ``'yellow'`` for the 3-6 band and ``'red'`` otherwise.  How the band
    limits are treated is decided by ``is_between``.
    """
    if val is None:
        return 'white'
    if is_between(val, 0, 3):
        return 'green'
    if is_between(val, 3, 6):
        return 'yellow'
    return 'red'
예제 #3
0
def score_c2c(val: Optional[Union[int, float]]) -> str:
    """Translate a c2c metric value into a colour name.

    Returns ``'white'`` for a missing value, ``'red'`` for the 0-0.44 band,
    ``'yellow'`` for the 0.44-0.55 band and ``'green'`` otherwise.  How the
    band limits are treated is decided by ``is_between``.
    """
    if val is None:
        return 'white'

    # (lower, upper, colour), checked in order; first matching band wins.
    bands = (
        (0, 0.44, 'red'),
        (0.44, 0.55, 'yellow'),
    )
    for lower, upper, colour in bands:
        if is_between(val, lower, upper):
            return colour
    return 'green'
예제 #4
0
def score_cohesion(val: Optional[Union[int, float]]) -> str:
    """Translate a cohesion metric value into a colour name.

    Returns ``'white'`` for a missing value, ``'green'`` for the 0-33 band,
    ``'yellow'`` for the 33-67 band and ``'red'`` otherwise.  How the band
    limits are treated is decided by ``is_between``.
    """
    if val is None:
        return 'white'
    return (
        'green' if is_between(val, 0, 33)
        else 'yellow' if is_between(val, 33, 67)
        else 'red'
    )
예제 #5
0
def score_nesting(val: Optional[Union[int, float]]) -> str:
    """Translate a nesting metric value into a colour name.

    Returns ``'white'`` for a missing value, ``'green'`` for the 0-2 band,
    ``'yellow'`` for the 2-5 band and ``'red'`` otherwise.  How the band
    limits are treated is decided by ``is_between``.
    """
    if val is None:
        return 'white'
    if is_between(val, 0, 2):
        return 'green'
    if is_between(val, 2, 5):
        return 'yellow'
    return 'red'
예제 #6
0
def score_essential(val: Optional[Union[int, float]]) -> str:
    """Translate an "essential" metric value into a colour name.

    Returns ``'white'`` for a missing value, ``'green'`` for the 0-4 band,
    ``'yellow'`` for the 4-10 band and ``'red'`` otherwise.  How the band
    limits are treated is decided by ``is_between``.
    """
    if val is None:
        return 'white'

    # (lower, upper, colour), checked in order; first matching band wins.
    bands = (
        (0, 4, 'green'),
        (4, 10, 'yellow'),
    )
    for lower, upper, colour in bands:
        if is_between(val, lower, upper):
            return colour
    return 'red'
예제 #7
0
def score_sloc(val: Optional[Union[int, float]]) -> str:
    """Translate a SLOC metric value into a colour name.

    Returns ``'white'`` for a missing value, ``'green'`` for the 0-100 band,
    ``'yellow'`` for the 100-200 band and ``'red'`` for the 200-500 band as
    well as for anything outside every band.  How the band limits are
    treated is decided by ``is_between``.
    """
    if val is None:
        return 'white'

    # (lower, upper, colour), checked in order; first matching band wins.
    bands = (
        (0, 100, 'green'),
        (100, 200, 'yellow'),
        (200, 500, 'red'),
    )
    for lower, upper, colour in bands:
        if is_between(val, lower, upper):
            return colour
    # Values outside every band are reported as red too.
    return 'red'
예제 #8
0
def score_cyclo(val: Optional[Union[int, float]]) -> str:
    """Translate a cyclomatic-complexity value into a colour name.

    Returns ``'white'`` for a missing value, ``'green'`` for the 0-4 band,
    ``'yellow'`` for the 4-10 and 10-20 bands, and ``'red'`` for the 20-50
    band as well as for anything outside every band.  How the band limits
    are treated is decided by ``is_between``.
    """
    if val is None:
        return 'white'

    # (lower, upper, colour), checked in order; first matching band wins.
    bands = (
        (0, 4, 'green'),
        (4, 10, 'yellow'),
        (10, 20, 'yellow'),
        (20, 50, 'red'),
    )
    for lower, upper, colour in bands:
        if is_between(val, lower, upper):
            return colour
    # Values outside every band are reported as red too.
    return 'red'
예제 #9
0
def cyclo_to_bad_fix(val: Optional[Union[int, float]]) -> Optional[int]:
    """Map a cyclomatic-complexity value to a "bad fix" figure.

    Returns ``None`` for a missing value; otherwise the figure attached to
    the first band that ``is_between`` accepts, or ``60`` when no band
    matches (this includes values below the first band).
    """
    if val is None:
        return None

    # (lower, upper, figure), checked in order; first matching band wins.
    # NOTE(review): the last band is (50, 50), which looks degenerate --
    # confirm the intended upper bound.
    bands = (
        (1, 10, 5),
        (10, 20, 10),
        (20, 30, 20),
        (30, 50, 30),
        (50, 50, 40),
    )
    for lower, upper, figure in bands:
        if is_between(val, lower, upper):
            return figure
    return 60
예제 #10
0
def get_interested_topic(uid_list, comment_path):
    """Count, per user, the comments published inside the training window.

    Every uid in ``uid_list`` gets an entry ``[num_comments, num_topics]``
    initialised to ``[0, 0]``; this function only fills ``num_comments``
    (index 0).  Users without any matching comment therefore keep their
    zero counts.

    Args:
        uid_list: iterable of user ids to track.
        comment_path: path of a UTF-8 comment file with one record per line,
            fields separated by ``'[=]'``; field 3 is read as the user id and
            field 4 as the publication date (``%Y-%m-%d %H:%M:%S``).

    Returns:
        dict mapping uid -> ``[num_comments, num_topics]``.

    Raises:
        IndexError: if a line has fewer than five fields.
        ValueError: if a publication date cannot be parsed.
    """
    # user id ==> [num_comments, num_topics]
    behavior = dict()
    for uid in uid_list:
        behavior[uid] = [0, 0]

    # The context manager guarantees the file is closed even if a line
    # turns out to be malformed.
    with codecs.open(comment_path, 'r', 'utf-8') as f:
        for line in f:
            seg_list = line.strip().split('[=]')
            uid = seg_list[3]
            pubdate = datetime.strptime(seg_list[4], "%Y-%m-%d %H:%M:%S")

            if uid in behavior and is_between(pubdate, TRAIN_START_DATE, TRAIN_END_DATE):
                behavior[uid][0] += 1  # one more comment by this user

    return behavior
예제 #11
0
def score_bad_fix(_val: Optional[Union[int, float]]) -> str:
    """Translate a cyclomatic-complexity value into a "bad fix" colour.

    The input is first converted with ``cyclo_to_bad_fix``.  A ``None``
    result gives ``'white'``; the 0-10 and 10-20 bands give ``'green'``,
    the 20-30 and 30-50 bands give ``'yellow'``, and the 50-80 band as well
    as anything outside every band gives ``'red'``.  How the band limits are
    treated is decided by ``is_between``.
    """
    bad_fix = cyclo_to_bad_fix(_val)
    if bad_fix is None:
        return 'white'

    # (lower, upper, colour), checked in order; first matching band wins.
    bands = (
        (0, 10, 'green'),
        (10, 20, 'green'),
        (20, 30, 'yellow'),
        (30, 50, 'yellow'),
        (50, 80, 'red'),
    )
    for lower, upper, colour in bands:
        if is_between(bad_fix, lower, upper):
            return colour
    # Values outside every band are reported as red too.
    return 'red'
예제 #12
0
 def check_angular_position(self, name, bbox):
     """Check whether ``bbox`` sits between the angular neighbours of ``name``.

     The neighbours are the entries just before and just after ``name`` in
     ``self.angular_order`` (wrapping around at both ends).  Their angular
     positions come from ``utils.get_bbox_dict_ang_pos`` applied to
     ``self.tmp_track``; the result is whatever ``utils.is_between`` reports
     for the angle of ``bbox`` against those two positions.

     Raises:
         ValueError: if ``name`` is not in ``self.angular_order``.
         KeyError: if a neighbour has no entry in the angular-position dict.
     """
     bbox_angle = utils.get_angle(bbox, self.cur_img.shape)

     order = self.angular_order
     size = len(order)
     # NOTE(review): index() returns the first match, so doubling the list
     # looks unnecessary if angular_order is a plain list -- confirm.
     position = (order * 2).index(name)
     neighbour_before = order[(position - 1) % size]
     neighbour_after = order[(position + 1) % size]

     positions = utils.get_bbox_dict_ang_pos(self.tmp_track,
                                             self.cur_img.shape)
     lower = positions[neighbour_before]
     upper = positions[neighbour_after]
     return utils.is_between(lower, upper, bbox_angle)
예제 #13
0
    def get_bbox_between_id(self, id1, id2, bbox_fd_list):
        """Return the detections lying angularly between two tracked ids.

        The angles of the tracked boxes ``id1`` and ``id2`` (looked up in
        ``self.tmp_track`` under ``config.BBOX_KEY``) delimit a sector; every
        box of ``bbox_fd_list`` whose angle ``utils.is_between`` accepts for
        that sector is kept.

        Args:
            id1: key of the first delimiting entry in ``self.tmp_track``.
            id2: key of the second delimiting entry in ``self.tmp_track``.
            bbox_fd_list: candidate bounding boxes.

        Returns:
            ``(bbox_list, angle_list)``: the selected boxes and their angles
            as two parallel lists, sorted by ascending ``(angle, bbox)``.
            Both lists are empty when nothing matches.
        """
        a1 = utils.get_angle(self.tmp_track[id1][config.BBOX_KEY],
                             self.cur_img.shape)
        a2 = utils.get_angle(self.tmp_track[id2][config.BBOX_KEY],
                             self.cur_img.shape)

        # Collect (angle, bbox) pairs in one pass and sort once at the end,
        # instead of re-zipping and re-sorting after every match.
        pairs = []
        for bbox_fd in bbox_fd_list:
            angle = utils.get_angle(bbox_fd, self.cur_img.shape)
            if utils.is_between(a1, a2, angle):
                pairs.append((angle, bbox_fd))
        pairs.sort()

        bbox_list = [bbox for _, bbox in pairs]
        angle_list = [angle for angle, _ in pairs]
        return bbox_list, angle_list
예제 #14
0
def gen_interest_text(uid_list, behavior, topic_path):
    """Add the number of topics each user published to ``behavior``.

    Reads the topic file and, for every topic published inside the training
    window by a user present in ``behavior``, increments that user's topic
    counter (index 1 of the entry) in place.

    Args:
        uid_list: unused; kept for interface compatibility.
        behavior: dict mapping uid -> list whose element 1 is the topic
            counter; modified in place.
        topic_path: path of a UTF-8 topic file with one record per line,
            fields separated by ``'[=]'``; field 2 is read as the user id and
            field 3 as the publication date (``%Y-%m-%d %H:%M:%S``).

    Returns:
        None.

    Raises:
        IndexError: if a line has fewer than four fields.
        ValueError: if a publication date cannot be parsed.
    """
    # The context manager guarantees the file is closed even if a line
    # turns out to be malformed.
    with codecs.open(topic_path, 'r', 'utf-8') as f:
        for line in f:
            seg_list = line.strip().split('[=]')
            uid = seg_list[2]
            pubdate = datetime.strptime(seg_list[3], "%Y-%m-%d %H:%M:%S")
            # Count the topics the user published.
            if uid in behavior and is_between(pubdate, TRAIN_START_DATE, TRAIN_END_DATE):
                behavior[uid][1] += 1
예제 #15
0
파일: prepare.py 프로젝트: hitalex/crawler
def load_topic_user(filepath, start_date=VERY_EARLY_TIME, end_date=VERY_LATE_TIME):
    """Load the topics published in a date range, plus their creators.

    Each line of the file is one record whose fields are separated by
    ``'[=]'``: topic id, group id, user id, publication date
    (``%Y-%m-%d %H:%M:%S``), title, content and, optionally, a comment list.
    Lines with fewer than six fields are logged and skipped.

    Args:
        filepath: path of the UTF-8 topic file.
        start_date: start of the accepted publication window.
        end_date: end of the accepted publication window.

    Returns:
        ``(topic_dict, user_set)``: ``topic_dict`` maps topic id -> dict with
        the keys ``'topic_id'``, ``'group_id'``, ``'user_id'``,
        ``'pubdate'``, ``'title'``, ``'content'`` and ``'comment_list'``;
        ``user_set`` holds the ids of the users who created those topics.

    Raises:
        ValueError: if a publication date cannot be parsed.
    """
    print('Loading topic from %s' % filepath)
    # topic_id --> topic dict
    topic_dict = dict()
    user_set = set()
    bad_count = 0

    # The context manager guarantees the file is closed even if a record
    # fails to parse.
    with codecs.open(filepath, 'r', 'utf-8') as f:
        for line in f:
            line = line.strip()
            seg_list = line.split('[=]')
            if len(seg_list) < 6:
                log.info('Bad formatted topic: %s' % line)
                bad_count += 1
                continue

            pubdate = datetime.strptime(seg_list[3], "%Y-%m-%d %H:%M:%S")
            if not is_between(pubdate, start_date, end_date):
                continue

            # The comment list is only taken from records with exactly seven
            # fields; a single trailing comma is dropped.
            comment_list = ''
            if len(seg_list) == 7:
                comment_list = seg_list[6]
                if comment_list != '' and comment_list[-1] == ',':
                    comment_list = comment_list[0:-1]

            topic = {
                'topic_id': seg_list[0],
                'group_id': seg_list[1],
                'user_id': seg_list[2],
                'pubdate': pubdate,
                'title': seg_list[4],
                'content': seg_list[5],
                'comment_list': comment_list,
            }
            user_set.add(topic['user_id'])
            topic_dict[topic['topic_id']] = topic

    log.info('Number of bad formatted topic: %d' % bad_count)

    return topic_dict, user_set
예제 #16
0
파일: prepare.py 프로젝트: hitalex/crawler
def load_comment_user(filepath, topic_dict, start_date=VERY_EARLY_TIME, end_date=VERY_LATE_TIME):
    """Load the comments published in a date range, plus their authors.

    Each line of the file is one record with exactly seven fields separated
    by ``'[=]'``: comment id, group id, topic id, user id, publication date
    (``%Y-%m-%d %H:%M:%S``), referenced comment id and content.  Lines with
    any other field count are logged and skipped.  Only comments whose topic
    is present in ``topic_dict`` are kept.

    Args:
        filepath: path of the UTF-8 comment file.
        topic_dict: mapping of accepted topic ids (only membership is used).
        start_date: start of the accepted publication window.
        end_date: end of the accepted publication window.

    Returns:
        ``(comment_dict, user_set)``: ``comment_dict`` maps comment id ->
        dict with the keys ``'comment_id'``, ``'group_id'``, ``'topic_id'``,
        ``'user_id'``, ``'pubdate'``, ``'ref_comment_id'`` and ``'content'``;
        ``user_set`` holds the ids of the users who wrote those comments.

    Raises:
        ValueError: if a publication date cannot be parsed.
    """
    print('Loading comment from %s' % filepath)
    comment_dict = dict()
    user_set = set()
    bad_count = 0

    # The context manager closes the file on every exit path.
    with codecs.open(filepath, 'r', 'utf-8') as f:
        for line in f:
            line = line.strip()
            seg_list = line.split('[=]')
            if len(seg_list) != 7:
                log.info('Bad formatted comment: %s' % line)
                bad_count += 1
                continue

            pubdate = datetime.strptime(seg_list[4], "%Y-%m-%d %H:%M:%S")
            topic_id = seg_list[2]
            if topic_id not in topic_dict or not is_between(pubdate, start_date, end_date):
                continue

            comment = {
                'comment_id': seg_list[0],
                'group_id': seg_list[1],
                'topic_id': topic_id,
                'user_id': seg_list[3],
                'pubdate': pubdate,
                'ref_comment_id': seg_list[5],
                'content': seg_list[6],
            }
            user_set.add(comment['user_id'])
            comment_dict[comment['comment_id']] = comment

    log.info('Number of bad formatted comment: %d' % bad_count)
    return comment_dict, user_set
예제 #17
0
 def move(self, colliders):
     """Advance the object by its current velocity, bouncing off colliders.

     The displacement still to be travelled starts as
     ``(velocity_x, velocity_y)``.  On each pass the straight path to the
     target position is tested against every collider.  If none is hit, the
     remaining displacement (scaled by ``self.mod``) is applied and the move
     ends.  If one is hit, the object is moved up to the bounce point, the
     relevant velocity component is reflected, and the loop continues with
     whatever displacement is left.

     Args:
         colliders: iterable of ``(point1, point2)`` pairs, each point being
             an indexable ``(x, y)`` pair.  Only vertical or horizontal
             colliders are supported.

     Raises:
         ValueError: if the collider that was hit is neither vertical nor
             horizontal.
     """
     # Displacement that still has to be travelled during this move.
     rem_vel_x = self.velocity_x
     rem_vel_y = self.velocity_y
     while rem_vel_x or rem_vel_y:  # while the ball can still move
         # Where the object would end up if nothing were in the way.
         target_pos = Vector(rem_vel_x, rem_vel_y) + self.center
         for collider in colliders:
             point1, point2 = collider
             bounce_pos = utils.find_intersection(*self.center, *target_pos, *point1, *point2)
             if bounce_pos is None:
                 continue  # Will never collide unless angle changes
             # The radius is subtracted so that the edge of the object,
             # rather than its centre, is compared with the target distance.
             distance_bounce = utils.distance(*self.center, *bounce_pos) - self.radius
             distance_target = utils.distance(*self.center, *target_pos)
             if distance_bounce > distance_target:
                 continue  # Did not collide yet
             # presumably checks that the bounce point lies between the
             # collider's two end points -- verify against utils.is_between
             if not utils.is_between(*collider, bounce_pos):
                 continue # Moves past collider
             # First collider that is actually hit: leave the for loop with
             # `collider` and `bounce_pos` still bound to it.
             break
         else:  # Did not collide with any collider -> free to move
             self.center_x += rem_vel_x * self.mod
             self.center_y += rem_vel_y * self.mod
             break
         # Part of the remaining displacement used to reach the bounce point.
         # NOTE(review): utils.to_zero is not visible here; it is presumably
         # some clamp of the first argument relative to the second -- confirm.
         dist_x = utils.to_zero(bounce_pos[0] - self.center_x, rem_vel_x)
         dist_y = utils.to_zero(bounce_pos[1] - self.center_y, rem_vel_y)
         rem_vel_x -= dist_x
         rem_vel_y -= dist_y
         # Reflect the component perpendicular to the collider: the step
         # about to be applied, the remaining displacement and the stored
         # velocity are all negated.
         if collider[0][0] == collider[1][0]:  # collider is vertical
             dist_x = -dist_x
             rem_vel_x = -rem_vel_x
             self.velocity_x = -self.velocity_x
         elif collider[0][1] == collider[1][1]:  # collider is horizontal
             dist_y = -dist_y
             rem_vel_y = -rem_vel_y
             self.velocity_y = -self.velocity_y
         else:
             # Reached for any collider that is not axis-aligned.
             raise ValueError("Collider", collider, "has to be a straight line")
         self.center_x += dist_x * self.mod
         self.center_y += dist_y * self.mod
         # The displacement multiplier grows by 0.1 after every bounce.
         self.mod += .1
예제 #18
0
    def correct_faces_by_proximity(self, bbox_fd_list, score_fd_list,
                                   corrected_bbox):
        """Assign leftover face detections to tracked ids by angular position.

        Detections whose score does not exceed
        ``config.face_detection_angle_trh`` are dropped first.  Each
        remaining detection that passes ``self.check_angle_proximity`` is
        given a candidate id: the single id lying between its already
        corrected neighbours in ``self.angular_order`` or, failing that, the
        not-yet-corrected id with the closest angle.  If
        ``self.check_angular_position`` accepts the candidate, the tracker is
        updated through ``self.correct_tracker``.

        Args:
            bbox_fd_list: detected bounding boxes.
            score_fd_list: detection scores, parallel to ``bbox_fd_list``.
            corrected_bbox: ids already corrected; used for membership tests
                and passed to ``utils.get_bbox_dict_ang_pos``.

        Returns:
            ``(bbox_fd_list, score_fd_list)``: the detections that cleared the
            threshold but were not assigned, with their scores.  ``([], [])``
            is returned when there are no detections at all or when no entry
            of ``self.tmp_track`` is left uncorrected.
        """
        if len(bbox_fd_list) == 0:
            return [], []

        # Keep only the detections whose score clears the threshold.
        bbox_fd_list = [
            bbox_fd_list[idx] for idx, score in enumerate(score_fd_list)
            if score > config.face_detection_angle_trh
        ]
        score_fd_list = [
            score for score in score_fd_list
            if score > config.face_detection_angle_trh
        ]

        # Positions (in the filtered lists) of the detections assigned below.
        indices = set()

        # Angles of the already corrected boxes, in angular order.
        corrected_bbox_angles_tmp = utils.get_bbox_dict_ang_pos(
            corrected_bbox, self.cur_img.shape)
        corrected_bbox_angles = {
            name: corrected_bbox_angles_tmp[name]
            for name in self.angular_order
            if name in corrected_bbox_angles_tmp
        }

        # Angles of the tracked boxes that have not been corrected yet.
        not_corrected_bbox_angles = {
            k: utils.get_angle(v[config.BBOX_KEY], self.cur_img.shape)
            for k, v in self.tmp_track.items() if k not in corrected_bbox
        }
        if not not_corrected_bbox_angles:
            return [], []

        for idx, bbox_fd in enumerate(bbox_fd_list):
            angle = utils.get_angle(bbox_fd, self.cur_img.shape)
            if corrected_bbox_angles:
                # First corrected id (in angular order) with a smaller angle
                # and first one with a larger angle than the detection.
                prev_id, next_id = None, None
                for name, value in corrected_bbox_angles.items():
                    if angle > value:
                        prev_id = name
                        break
                for name, value in corrected_bbox_angles.items():
                    if angle < value:
                        next_id = name
                        break
                # Wrap around when no neighbour was found on one side.
                if prev_id is None:
                    prev_id = list(corrected_bbox_angles.keys())[-1]
                if next_id is None:
                    next_id = list(corrected_bbox_angles.keys())[0]
                # The order is doubled so the slice from prev_id to next_id
                # can run past the end of the list.
                ang_order = self.angular_order * 2
                start = ang_order.index(prev_id)
                end = ang_order.index(next_id, start + 1)
                potential_id_list = ang_order[start + 1:end]
            else:
                potential_id_list = self.angular_order

            if self.check_angle_proximity(angle, corrected_bbox_angles):
                name = None
                if len(potential_id_list) == 1 and self.check_angular_position(
                        potential_id_list[0], bbox_fd):
                    name = potential_id_list[0]
                elif len(potential_id_list) == 0:
                    continue
                else:
                    # Fall back to the uncorrected id with the closest angle.
                    key, _ = min(not_corrected_bbox_angles.items(),
                                 key=lambda kv: abs(kv[1] - angle))
                    if self.check_angular_position(key, bbox_fd):
                        name = key

                if name is not None:
                    self.correct_tracker(name, bbox_fd, True)
                    indices.add(idx)
                    logging.info(
                        "Assigned {} to children {} by closest angular position"
                        .format(bbox_fd, name))

        # Hand back only the detections that were not assigned.
        bbox_fd_list = [
            i for j, i in enumerate(bbox_fd_list) if j not in indices
        ]
        score_fd_list = [
            i for j, i in enumerate(score_fd_list) if j not in indices
        ]
        return bbox_fd_list, score_fd_list
예제 #19
0
def main(argv):
    """Build the training and test topic files for one group.

    Loads the group's topics (from ``TRAIN_START_DATE`` to ``TEST_END_DATE``)
    and comments (from ``TRAIN_START_DATE`` to ``COMMENT_END_DATE``), attaches
    to every topic the ids of its comments and commenting users, and writes
    each topic to the train or test file according to its publication date.
    Test topics with fewer than five commenting users are skipped, and so are
    topics that fall in neither window.  Finally the ids of all users
    appearing in the written topics are saved.

    Args:
        argv: command-line arguments; ``argv[1]`` is the group id.

    Exits with status 1 when no group id is given.
    """
    if len(argv) < 2:
        print('Group ID not provided.')
        sys.exit(1)

    group_id = argv[1]
    log.info('Prepare training set and test set for group: %s' % group_id)

    path = 'tables/' + group_id + '/TopicInfo-raw-all-' + group_id
    topic_dict, _ = load_topic_user(path, TRAIN_START_DATE, TEST_END_DATE)  # load every topic
    message = 'Number of topics loaded: %d (From %s to %s)' % (len(topic_dict), str(TRAIN_START_DATE), str(TEST_END_DATE))
    print(message)
    log.info(message)

    path = 'tables/' + group_id + '/CommentInfo-raw-all-' + group_id
    comment_dict, _ = load_comment_user(path, topic_dict, TRAIN_START_DATE, COMMENT_END_DATE)
    message = 'Number of comments loaded: %d (From %s to %s))' % (len(comment_dict), str(TRAIN_START_DATE), str(COMMENT_END_DATE))
    print(message)
    log.info(message)

    print('Finding comment users for topics...')
    # Attach to each topic the ids of its comments and of the commenting users.
    for topic_id in topic_dict:
        topic = topic_dict[topic_id]
        topic['comment_set'] = set()
        topic['comment_user_set'] = set()

    for comment_id in comment_dict:
        comment = comment_dict[comment_id]
        topic_id = comment['topic_id']
        if topic_id in topic_dict:
            topic = topic_dict[topic_id]
            topic['comment_set'].add(comment_id)
            topic['comment_user_set'].add(comment['user_id'])

    train_path = 'tables/' + group_id + '/train-topic-' + group_id
    test_path = 'tables/' + group_id + '/test-topic-' + group_id

    print('Generating training and test dataset...')
    # Number of topics / commenting users that go to each set.
    train_topic_count = 0
    train_comment_count = 0
    test_topic_count = 0
    test_comment_count = 0
    user_set = set()  # ids of all users appearing in the train and test sets

    # Context managers close both output files even if writing fails.
    with codecs.open(train_path, 'w', 'utf-8') as train_topic_file:
        with codecs.open(test_path, 'w', 'utf-8') as test_topic_file:
            for topic_id, topic in topic_dict.items():
                pubdate = topic['pubdate']
                comment_user_set = topic['comment_user_set']
                log.info('Comment user number for topic %s is: %d' % (topic_id, len(comment_user_set)))
                if is_between(pubdate, TRAIN_START_DATE, TRAIN_END_DATE):
                    train_topic_count += 1
                    train_comment_count += len(comment_user_set)
                    f = train_topic_file
                elif is_between(pubdate, TEST_START_DATE, TEST_END_DATE):
                    # A test topic needs at least 5 commenting users.
                    if len(comment_user_set) < 5:
                        continue
                    test_topic_count += 1
                    test_comment_count += len(comment_user_set)
                    f = test_topic_file
                else:
                    # In neither window: skip it, so it is not written to
                    # whichever file the previous topic used.
                    continue

                user_set.add(topic['user_id'])  # topic creator
                user_set |= comment_user_set    # commenting users

                row = topic['topic_id'] + '[=]' + topic['group_id'] + '[=]' + \
                    topic['user_id'] + '[=]' + str(topic['pubdate']) + '[=]' + \
                    topic['title'] + '[=]' + topic['content'] + '[=]' + \
                    ','.join(topic['comment_set']) + '[=]' + ','.join(comment_user_set)
                row += '\n'
                f.write(row)

    # Write all user ids to file.
    path = 'social/' + group_id + '/all-users-' + group_id
    print('Writing user list to file: %s' % path)
    with codecs.open(path, 'w', 'utf-8') as f:
        for uid in user_set:
            f.write(uid + '\n')
    print('Total users in train and test set: %d' % len(user_set))

    print('For training, number of topics: %d, number of commenting users: %d' % (train_topic_count, train_comment_count))
    print('For test, number of topics: %d, number of commenting users: %d' % (test_topic_count, test_comment_count))
    print('Done')