예제 #1
0
def get_isFan(html, uids, current_uid):
    """
    Check whether any of ``uids`` appears in the follow list of a samefollow
    page, and store one ``UserRelation`` per matching uid.

    :param html: samefollow page
    :param uids: list contains uids to determine this account follows or not
    :param current_uid: current crawling user
    :return: 1 for yes 0 for no
    """
    # Nothing to parse: same early exit as get_fans_or_follows uses.
    if not html:
        return 0

    pattern = re.compile(r'FM.view\((.*)\)')
    pattern_uid = re.compile(r'uid=(.*?)&')

    # Contains the intersection of param uids and the uids found on the page
    intersection_ids = []
    for script in BeautifulSoup(html, "html.parser").find_all('script'):
        text = script.string
        # ``.string`` is None when the tag has no single text child
        # (e.g. empty or external scripts); searching None would raise.
        if not text or 'pl.content.followTab.index' not in text:
            continue
        # Find the <script>FM.view({"ns":"pl.content.followTab.index",...
        m = pattern.search(text)
        if not m:
            continue

        cont = json.loads(m.group(1)).get('html', '')
        follow_box = BeautifulSoup(cont, 'html.parser').find(
            attrs={'class': 'follow_box'})
        # A page without a follow box simply yields no follow items.
        follows = []
        if follow_box is not None:
            follows = follow_box.find_all(attrs={'class': 'follow_item'})

        # Contains uids listed on the page; non-numeric ids are filtered out
        user_ids = []
        for follow in follows:
            found = pattern_uid.search(str(follow))
            if found and found.group(1).isdigit():
                user_ids.append(found.group(1))

        # Most the same with get_fans_or_follows(html, uid, type), except
        # the following lines calculate which of ``uids`` are on the page.
        intersection_ids = list(set(user_ids).intersection(set(uids)))

        # Now store in the database
        relation_type = 1
        relations = [
            UserRelation(uid, current_uid, relation_type, None, False)
            for uid in intersection_ids
        ]
        UserRelationOper.add_all(relations)
        break

    # legacy support: callers expect an int flag, not a bool
    return 1 if intersection_ids else 0
예제 #2
0
def get_fans_or_follows(html, uid, type):
    """
    Get fans or follows and store their relationships

    Only relations for which ``UserRelationOper.get_user_by_uid`` returns a
    falsy value are collected and stored; uids it reports as already present
    are left out of the returned list.

    :param html: current page source
    :param uid: current user id
    :param type: type of relations, 1 stands for fans,2 stands for follows
    :return: list of newly stored fan/follow uids (strings of digits)
    """
    # Empty (or missing) page source: nothing to parse or store.
    if not html:
        return []

    pattern = re.compile(r'FM.view\((.*)\)')
    pattern_uid = re.compile(r'uid=(.*?)&')
    pattern_from = re.compile(r'通过.+?关注')

    user_ids = []
    relations = []
    for script in BeautifulSoup(html, "html.parser").find_all('script'):
        text = script.string
        # ``.string`` is None when the tag has no single text child
        # (e.g. empty or external scripts); searching None would raise.
        if not text or 'pl.content.followTab.index' not in text:
            continue
        m = pattern.search(text)
        if not m:
            continue

        cont = json.loads(m.group(1)).get('html', '')
        follow_box = BeautifulSoup(cont, 'html.parser').find(
            attrs={'class': 'follow_box'})
        # A page without a follow box simply yields no follow items.
        follows = []
        if follow_box is not None:
            follows = follow_box.find_all(attrs={'class': 'follow_item'})

        for follow in follows:
            found = pattern_uid.search(str(follow))
            if not found:
                continue
            follow_uid = found.group(1)
            # filter invalid ids
            if not follow_uid.isdigit():
                continue
            # skip relations that are already stored
            if UserRelationOper.get_user_by_uid(uid, follow_uid, type):
                continue

            source = pattern_from.search(follow.text)
            # Keep only the text between the two-character prefix and
            # suffix of the match. Items without that text get None
            # instead of raising on a failed search.
            from_where = source.group(0)[2:-2] if source else None

            user_ids.append(follow_uid)
            relations.append(UserRelation(uid, follow_uid, type, from_where))
        break

    UserRelationOper.add_all(relations)
    return user_ids
예제 #3
0
def get_isFan(html, uids, current_uid):
    """
    Check whether any of ``uids`` appears in the follow list of a samefollow
    page, and store one ``UserRelation`` per matching uid.

    :param html: samefollow page
    :param uids: list contains uids to determine this account follows or not
    :param current_uid: current crawling user
    :return: 1 for yes 0 for no
    """
    # Nothing to parse, so no uid can match.
    if not html:
        return 0

    pattern = re.compile(r'FM.view\((.*)\)')
    pattern_uid = re.compile(r'uid=(.*?)&')

    # Contains the intersection of param uids and the uids found on the page
    intersection_ids = []
    for script in BeautifulSoup(html, "html.parser").find_all('script'):
        text = script.string
        # ``.string`` is None when the tag has no single text child
        # (e.g. empty or external scripts); searching None would raise.
        if not text or 'pl.content.followTab.index' not in text:
            continue
        # Find the <script>FM.view({"ns":"pl.content.followTab.index",...
        m = pattern.search(text)
        if not m:
            continue

        cont = json.loads(m.group(1)).get('html', '')
        follow_box = BeautifulSoup(cont, 'html.parser').find(
            attrs={'class': 'follow_box'})
        # A page without a follow box simply yields no follow items.
        follows = []
        if follow_box is not None:
            follows = follow_box.find_all(
                attrs={'class': 'follow_item S_line2'})

        # Contains uids listed on the page; non-numeric ids are filtered out
        user_ids = []
        for follow in follows:
            found = pattern_uid.search(str(follow))
            if found and found.group(1).isdigit():
                user_ids.append(found.group(1))

        # Calculate which of the requested ``uids`` are listed on the page.
        intersection_ids = list(set(user_ids).intersection(set(uids)))

        # Now store in the database
        relation_type = 1
        relations = [
            UserRelation(uid, current_uid, relation_type, None, False)
            for uid in intersection_ids
        ]
        UserRelationOper.add_all(relations)
        break

    # legacy support: callers expect an int flag, not a bool
    return 1 if intersection_ids else 0