Example #1
    def load(self, to_save=False):
        """Build Influencer records from self.get_fields_data(); when to_save is True,
        persist them (with demographics locality and brand mapping) and tag them with
        the R29 custom-data tag. Returns the list of saved influencers (empty otherwise)."""
        from debra.models import (InfluencersGroup,
            DemographicsLocality, Influencer, InfluencerBrandMapping,)

        fields_data = self.get_fields_data()
        if self.limit:
            fields_data = fields_data[:self.limit]

        infs = []

        if to_save:
            tag = InfluencersGroup.objects.get(
                id=constants.R29_CUSTOM_DATA_TAG_ID)

            for data in fields_data:
                inf = Influencer(**data['influencer'])

                demographics_locality = DemographicsLocality.objects.get_or_create(
                    **data['demographics'])[0]
                inf.demographics_locality = demographics_locality
                inf.save(bypass_checks=True)

                tag.add_influencer(inf)

                brand_mapping = InfluencerBrandMapping(**data['brand_mapping'])
                brand_mapping.influencer = inf
                brand_mapping.save()

                infs.append(inf)

        return infs
Example #2
def handle_influencer_duplicates(typ):
    assert typ in ('validated', 'non_validated')
    if typ == 'validated':
        infs = Influencer.objects.filter(
            validated_on__contains=constants.ADMIN_TABLE_INFLUENCER_INFORMATIONS,
        ).exclude(
            validated_on__contains=constants.ADMIN_TABLE_INFLUENCER_SELF_MODIFIED,
        ).exclude(blacklisted=True)
    elif typ == 'non_validated':
        infs = Influencer.objects.exclude(
            validated_on__contains=constants.ADMIN_TABLE_INFLUENCER_INFORMATIONS,
        ).exclude(
            validated_on__contains=constants.ADMIN_TABLE_INFLUENCER_SELF_MODIFIED,
        ).exclude(blacklisted=True).exclude(source__isnull=True).exclude(
            blog_url__isnull=True).exclude(blog_url='')
    else:
        assert False

    infs_count = infs.count()
    log.info('Handling duplicates for %r infs', infs_count)

    for i, inf in enumerate(infs.iterator()):
        log.info('Processing %d/%d %r', i + 1, infs_count, inf)
        try:
            if Influencer.find_duplicates(inf.blog_url, inf.id):
                inf.handle_duplicates()
        except Exception:
            log.exception('While handling duplicates for %r, skipping', inf)
Example #3
def prepare_rawblogurl(raw_blog):
    """
    :param raw_blog: a `debra.models.BlogUrlsRaw` instance

    Check wether an influencer exist with that blog_url. If there's no influencer it creates one.
    Then get or creates a blog platform and mark the platform as INVESTIGATING.
    """
    duplicate_infs = Influencer.find_duplicates(blog_url=raw_blog.blog_url)

    if duplicate_infs:
        inf = duplicate_infs[0]
        inf.handle_duplicates()
    else:
        inf = Influencer.objects.create(source='lookbook', blog_url=raw_blog.blog_url)
    print raw_blog.site_url
    # lookbook_plat = Platform.objects.get_or_create(url=raw_blog.blog_url, platform_name="lookbook", influencer=inf) # why?

    if 'wordpress' in raw_blog.blog_url.lower():
        platform_name = 'Wordpress'
    elif 'blogspot' in raw_blog.blog_url.lower():
        platform_name = 'Blogspot'
    else:
        platform_name = 'Custom'

    blog_platform, created = Platform.objects.get_or_create(url=raw_blog.blog_url, platform_name=platform_name, influencer=inf)
    blog_platform.platform_state = "INVESTIGATING"
    blog_platform.save()

    determine_platform_state(blog_platform)
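
A minimal usage sketch for prepare_rawblogurl, assuming BlogUrlsRaw rows are simply iterated (the slice only keeps the run bounded; any filtering of already-processed rows is left out as an assumption):

def prepare_pending_rawblogurls(limit=100):
    # Sketch only: BlogUrlsRaw comes from debra.models (see the docstring above).
    from debra.models import BlogUrlsRaw

    for raw_blog in BlogUrlsRaw.objects.all()[:limit]:
        prepare_rawblogurl(raw_blog)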
Example #4
def run_handle_duplicates_for_influencer(influencer_id):
    influencer = Influencer.objects.get(id=influencer_id)
    with platformutils.OpRecorder(operation='handle_inf_duplicates', influencer=influencer) as opr:
        dups = Influencer.find_duplicates(influencer.blog_url, influencer.id)
        if dups:
            log.info('Found %d duplicates, running handle_duplicates', len(dups))
            influencer.handle_duplicates()
        else:
            log.info('No duplicates found')
Example #5
def handle_influencer_duplicates_with_checks(max_id=999999):
    influencers = Influencer.objects.filter(
        validated_on__contains=constants.ADMIN_TABLE_INFLUENCER_INFORMATIONS,
    ).exclude(
        validated_on__contains=constants.ADMIN_TABLE_INFLUENCER_SELF_MODIFIED,
    ).exclude(blacklisted=True).filter(id__lte=int(max_id)).order_by('-id')
    for influencer in influencers:
        dups = Influencer.find_duplicates(influencer.blog_url)
        if len(dups) in (0, 1):
            log.info('OK %r', influencer)
            continue
        log.info('%d dups for %r', len(dups), influencer)

        before_with_shelf_user = [
            inf for inf in dups if inf.shelf_user is not None
        ]
        valid_platform_names_in_dups = {
            plat.platform_name
            for inf in dups
            for plat in inf.platform_set.exclude(url_not_found=True)
        }

        # run de-duplication
        selected = influencer.handle_duplicates()
        log.info('Selected: %r', selected)

        not_selected = [inf for inf in dups if inf.id != selected.id]
        assert len(not_selected) == len(dups) - 1

        # refresh old dups objects and selected
        dups = [Influencer.objects.get(id=inf.id) for inf in dups]
        selected = Influencer.objects.get(id=selected.id)

        after_with_shelf_user = [
            inf for inf in dups if inf.shelf_user is not None
        ]
        log.info('before/after with_shelf_user: %d %d',
                 len(before_with_shelf_user), len(after_with_shelf_user))
        assert len(before_with_shelf_user) <= len(after_with_shelf_user)

        valid_platform_names_in_selected = {
            plat.platform_name
            for plat in selected.platform_set.exclude(url_not_found=True)
        }
        log.info('platform_names in dups/selected: %s %s',
                 valid_platform_names_in_dups,
                 valid_platform_names_in_selected)
        assert valid_platform_names_in_dups == valid_platform_names_in_selected

        not_selected_validated = [
            inf for inf in not_selected
            if not inf.is_enabled_for_automated_edits()
        ]
        log.info('Not selected validated: %s', not_selected_validated)
        # if selected is not validated, check if we are not disabling validated
        if selected.is_enabled_for_automated_edits():
            assert not not_selected_validated
Example #6
def set_source_spreadsheet(filename):
    reader = spreadsheet_reader(filename)
    for row in reader:
        duplicate_infs = Influencer.find_duplicates(blog_url=row['url'])
        if len(duplicate_infs) > 0:
            inf = duplicate_infs[0]
            if inf.source == 'spreadsheet_import':
                print 'Influencer %s has source set to spreadsheet_import' % inf
            else:
                inf.source = 'spreadsheet_import'
                inf.save()
                print 'Updated influencer %s source to spreadsheet_import' % inf
        else:
            print '!!! No influencers for blog_url=%s' % row['url']
Example #7
def find_influencer_duplicates():
    res = []
    infs = Influencer.objects.filter(
        validated_on__contains=constants.ADMIN_TABLE_INFLUENCER_INFORMATIONS,
    ).exclude(
        validated_on__contains=constants.ADMIN_TABLE_INFLUENCER_SELF_MODIFIED,
    ).exclude(blacklisted=True)
    for inf in infs:
        dups = Influencer.find_duplicates(inf.blog_url, inf.id)
        if dups:
            log.info('YES_DUP %s %r %r', inf.id, inf, dups)
            res.append(inf)
        else:
            log.info('NO_DUP %s %r', inf.id, inf)
    log.info('Total duplicates: %s', len(res))
    pprint.pprint(res)
    return res
Example #8
def blogger_autocomplete(request):
    """
    similar to user_autocomplete, but for the fact that it operates on Influencers instead of UserProfile's
    @return json string containing search results
    """
    term = request.GET.get('term')

    influencers = Influencer.raw_influencers_for_search()
    matched_influencers = influencers.filter(name__isnull=False).filter(
        Q(name__icontains=term) |
        Q(email__icontains=term) |
        Q(platform__blogname__icontains=term))

    results = []
    for inf in matched_influencers:
        results.append({
            'id': inf.id,
            'name': unescape(inf.name),
            'img': unescape(inf.name)
        })

    return HttpResponse(status=200, content=json.dumps({'results': results}))
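
A quick way to exercise this view from a shell, assuming it is wired into the project's urls.py (the '/blogger_autocomplete/' path below is hypothetical):

from django.test import Client

client = Client()
resp = client.get('/blogger_autocomplete/', {'term': 'fashion'})
# Expected shape, per the code above:
# {"results": [{"id": ..., "name": ..., "img": ...}]}
print resp.content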
Example #9
def create_influencers_platforms_from_csv(filename,
                                          from_row='1',
                                          to_row='999999'):
    """Works with https://docs.google.com/spreadsheet/ccc?key=0Ai2GPRwzn6lmdEMzWVR0aldXYXJodGplZlVGRVMyQ1E&usp=sharing . To download CSV, add output=csv to the link: https://docs.google.com/spreadsheet/ccc?key=0Ai2GPRwzn6lmdEMzWVR0aldXYXJodGplZlVGRVMyQ1E&usp=sharing&output=csv
    """
    reader = spreadsheet_reader(filename)
    count = 0
    from_row = int(from_row)
    to_row = int(to_row)
    for row in reader:
        print "\n\nCount: %d" % count
        count += 1
        if count < from_row:
            print 'Skipping row %d' % count
            continue
        if count > to_row:
            print 'Skipping row %d' % count
            continue
        if row['email'] == 'email':
            # First title row
            continue
        if not (row['url'] or '').strip():
            # Empty row
            continue
        print 'Processing row %r' % row
        duplicate_infs = Influencer.find_duplicates(blog_url=row['url'])
        if len(duplicate_infs) > 0:
            inf = duplicate_infs[0]
            inf.handle_duplicates()
            print 'Using already saved influencer: %r' % inf
        else:
            inf = Influencer()
        #update info
        inf.source = 'spreadsheet_import'
        inf.name = row['blogger_name']
        inf.blog_url = row['url']
        inf.email = row['email']
        inf.demographics_location = row['location']
        inf.demographics_gender = row['gender']
        assert False, 'This script requires code update to *_url fields processing'
        if row['Facebook']:
            inf.fb_url = row['Facebook']
        if row['Pinterest']:
            inf.pin_url = row['Pinterest']
        if row['Twitter']:
            inf.tw_url = row['Twitter']
        if row['Instagram']:
            inf.insta_url = row['Instagram']
        if row['Bloglovin']:
            inf.bloglovin_url = row['Bloglovin']
        inf.save()
        print 'Saved new influencer: %r' % inf

        # Try to save blog as platform
        if row['url']:
            blog_pl = Platform.objects.filter(url=row['url'])
            if blog_pl.exists():
                print "Blog already exists for url %s [%s]" % (row['url'],
                                                               blog_pl)
            else:
                discovered_pl, corrected_url = fetcher.try_detect_platform_name(
                    row['url'])
                if discovered_pl:
                    blog_pl = Platform.find_duplicates(inf, row['url'],
                                                       discovered_pl)
                    if blog_pl and len(blog_pl) > 0:
                        blog_pl = blog_pl[0]
                        blog_pl = blog_pl.handle_duplicates()
                    else:
                        blog_pl = Platform()
                    blog_pl.influencer = inf
                    blog_pl.platform_name = discovered_pl
                    blog_pl.url = row['url']
                    blog_pl.blogname = row['blog_name']
                    blog_pl.save()
                    print 'Saved platform from blog data: %r' % blog_pl
                else:
                    print 'No platform discovered for blog url %r' % row['url']

        for platform_name in ('Facebook', 'Twitter', 'Pinterest', 'Bloglovin',
                              'Instagram'):
            if not row[platform_name]:
                print 'No url for platform %r' % platform_name
                continue
            pl = Platform.find_duplicates(inf, row[platform_name],
                                          platform_name)
            if pl and len(pl) > 0:
                pl = pl[0]
                pl = pl.handle_duplicates()
            else:
                pl = Platform()
            pl.influencer = inf
            pl.platform_name = platform_name
            pl.url = row[platform_name]
            pl.save()
            print 'Saved new platform %r' % pl
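
A minimal invocation sketch for the CSV importer above; the file path is hypothetical, and note that the `assert False` guard inside the function means the *_url handling has to be updated before a run can complete:

create_influencers_platforms_from_csv('/tmp/influencers_export.csv',
                                      from_row='1', to_row='100')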