예제 #1
0
def create_influencer_from_bad_brands(brand, to_save=True):
    """Create an Influencer from a Brand whose domain is really a blogger url.

    Example usage:

        blogspot = Brands.objects.filter(domain_name__icontains='blogspot.')
        blogspot.update(blacklisted=True)
        for b in blogspot:
            create_influencer_from_bad_brands(b, True)

    Double checks:
        - call this only for Brands that have not already been passed through
          this function;
        - do not run it for brands with domain_name in 'tumblr.com', because
          such influencers could have a separate blog (say on blogspot.com)
          and we would end up with duplicates.
    """
    with platformutils.OpRecorder(operation='import_from_bad_brand',
                                  brand=brand) as opr:
        brand_url = brand.domain_name
        brand_domain = utils.domain_from_url(brand_url)
        if brand_domain in BLACKLISTED_DOMAINS:
            log.info('Domain %r is blacklisted', brand_domain)
            return
        influencer = helpers.create_influencer_and_blog_platform(
            brand_url,
            'discovered_from_brands',
            to_save,
            platform_name_fallback=True)
        if not influencer:
            log.error('Blacklisted url: %r', brand_url)
        # record either the created influencer id or a skip counter on the op
        if influencer and influencer.id is not None:
            opr.data = {'inf_id_created': [influencer.id]}
        else:
            opr.data = {'inf_cnt_skipped': 1}
예제 #2
0
 def _prepare_test_influencer(self, op='created_for_testing'):
     """Pick the newest eligible influencer, disable it, and recreate it.

     Stores the original on self.orig_inf and the recreated influencer on
     self.inf, recording the recreation with an OpRecorder entry.
     """
     candidates = models.Influencer.objects.filter(
         relevant_to_fashion=True,
         show_on_search=False,
         source__isnull=False,
         classification='blog',
         blacklisted=False,
     ).exclude(
         validated_on__contains=constants.ADMIN_TABLE_INFLUENCER_INFORMATIONS
     ).exclude(
         validated_on__contains=constants.ADMIN_TABLE_INFLUENCER_SELF_MODIFIED
     ).order_by('-id')
     log.info('%d infs', candidates.count())
     assert candidates.exists()
     original = candidates[0]
     self.orig_inf = original
     log.info('Recreating influencer %r', original)
     # capture what we need before disabling the original
     original_blog_url = original.blog_url
     original_source = original.source
     self._disable_inf(original)
     self.inf = helpers.create_influencer_and_blog_platform(
         original_blog_url,
         original_source,
         to_save=True,
         platform_name_fallback=True)
     assert self.inf is not None
     with platformutils.OpRecorder(operation=op,
                                   influencer=self.inf) as opr:
         opr.data = {'source_influencer_id': original.id}
     log.info('New influencer for testing: %r', self.inf)
예제 #3
0
def search_infs_using_preloaded_urls(queries, pages=20):
    """Collect Google result urls for each query and create influencers.

    For every url that is not in the exclude-domains blacklist and has no
    duplicate influencers yet, a new influencer is created with source
    'google'.

    :param queries: iterable of Google query strings
    :param pages: number of result pages to collect per query
    """
    for q in queries:
        try:
            urls = collect_urls_from_google(q, pages)
        except Exception:
            log.exception(
                'While collect_urls_from_google(%r), going to the next query',
                q)
            continue
        # BUGFIX: a leftover debug `print`/`return` pair here made the loop
        # below unreachable and aborted after the first query.
        log.info('Got urls: %s', urls)
        for url in urls:
            try:
                if utils.domain_from_url(
                        url) in import_from_blog_post.exclude_domains_set:
                    log.warn('%r is blacklisted', url)
                    continue
                dups = models.Influencer.find_duplicates(url)
                log.info('%r dups: %s', url, dups)
                if not dups:
                    log.info('YES_CREATE %r', url)
                    new_inf = helpers.create_influencer_and_blog_platform(
                        url, 'google', platform_name_fallback=True)
                    log.info('Created influencer: %r', new_inf)
                else:
                    log.info('NO_CREATE %r', url)
            # except Exception (not bare except) so Ctrl-C still aborts
            except Exception:
                log.exception('While processing url %r, skipping', url)
예제 #4
0
def find_and_connect_user_to_influencer(user_prof, to_save=True, **kwargs):
    """
    Connect a UserProfile with an Influencer object and sync the result to Intercom.

    Finds (or creates) an influencer for the profile's blog page, copies the
    user's name/email onto it, and links the two objects in both directions.
    Sends an email to admins via mailsnake when no influencer can be found.

    :param user_prof: the UserProfile to connect
    :param to_save: when True, persist both objects and trigger intercom
        updates / postprocessing
    :param kwargs: unused here; accepted for caller compatibility
    :return: None
    """
    from debra.models import Influencer
    from debra.helpers import create_influencer_and_blog_platform, send_admin_email_via_mailsnake
    from platformdatafetcher import platformutils, postprocessing

    blog_url = user_prof.blog_page
    # source 'blogger_signup'; final positional False disables the
    # platform-name fallback used elsewhere in this module
    influencer = create_influencer_and_blog_platform(blog_url,
                                                     'blogger_signup', to_save,
                                                     False)
    log.info("Found %r possible influencer for profile [%s %s]" %
             (influencer, user_prof.user, user_prof.blog_page))

    if not influencer:
        log.info("No influencer found for User_prof_id: %s" % (user_prof.id, ))
        send_admin_email_via_mailsnake("No influencer found for user",
                                       "User_prof_id: %s" % (user_prof.id, ))
        # flag the failure on the profile; saved below when to_save is True
        user_prof.error_when_connecting_to_influencer = "NO INFLUENCERS"
    else:
        log.info("Found %s influencer for signed up user %s" %
                 (influencer, user_prof))
        # copy signup data onto the influencer and link the objects both ways
        influencer.name = user_prof.name
        influencer.email_for_advertising_or_collaborations = user_prof.user.email
        influencer.email = user_prof.user.email
        user_prof.influencer = influencer
        influencer.shelf_user = user_prof.user
        influencer.append_source('blogger_signup')
        log.info("Done connecting User: [%s, %s] with Influencer: [%s, %s]" %
                 (user_prof.blog_page, user_prof.user.email,
                  influencer.email_for_advertising_or_collaborations,
                  influencer.blog_url))

    if to_save:
        user_prof.save()
        if influencer:
            influencer.save()
            user_prof.update_intercom()
            # if influencer is showing on search, their profile must be ok, so invite them
            if influencer.show_on_search and not influencer.ready_to_invite:
                influencer.ready_to_invite = True
                influencer.save()
                user_prof.update_intercom()
            # if they have been already qa-ed, invite them
            elif influencer.validated_on and 'info' in influencer.validated_on and not influencer.ready_to_invite:
                influencer.ready_to_invite = True
                influencer.save()
                user_prof.update_intercom()
            # now, if this influencer is not validated or not showing on search
            # NOTE(review): this else also fires when the influencer is already
            # ready_to_invite (both guards above require ready_to_invite=False),
            # re-issuing full processing for already-invited influencers -- confirm intended
            else:
                # issue the complete processing
                postprocessing.process_new_influencer_sequentially(
                    influencer.id, assume_blog=True)

        check_user_prof_influencer_connectivity(user_prof.id)
예제 #5
0
 def _start_processing(self):
     """Create the test influencer, then run processing in a background thread."""
     self.inf = helpers.create_influencer_and_blog_platform(
         TEST_BLOG_URL, INF_SOURCE, to_save=True, platform_name_fallback=True)
     worker = threading.Thread(target=self._do_processing)
     self.processing_thread = worker
     worker.start()
예제 #6
0
def import_network_bloggers(filename):
    """Import bloggers from a network CSV export, creating/updating influencers.

    The file's basename is expected to look like '<name> - <blogger_type>.csv';
    the extracted blogger_type is stored on each influencer. The header row is
    skipped. Rows with invalid/missing urls or sources are skipped, as are
    influencers not enabled for automated edits.

    :param filename: path to the CSV file
    """
    with open(filename, 'rb') as f:
        lines = f.readlines()[1:]  # skip the header row
    reader = csv.DictReader(lines,
                            ('unusual', 'blog_name', 'url', 'persons_name',
                             'location', 'source', 'description'))
    blogger_type = os.path.basename(filename).split('.')[0].split(' - ')[1]
    log.info('blogger_type: %r', blogger_type)
    for row in reader:
        try:
            log.info('row: %r', row)
            if not row['url'].startswith('http'):
                log.warn('Skipping row with invalid url %r', row['url'])
                continue
            source = utils.domain_from_url(row['source'])
            if not source.strip():
                log.warn('Skipping row with no source')
                continue
            if not row['url'].strip():
                log.warn('Skipping row with no url')
                continue
            inf = helpers.create_influencer_and_blog_platform(
                row['url'], source, to_save=True, platform_name_fallback=True)
            if not inf:
                log.warn('Skipping blacklisted url')
                continue
            if not inf.is_enabled_for_automated_edits():
                log.warn(
                    'Influencer is not enabled for automated edits, skipping')
                continue
            # copy CSV fields onto the influencer
            inf.blogname = row['blog_name']
            inf.blogger_type = blogger_type
            inf.name = row['persons_name']
            inf.demographics_location = row['location']
            inf.description = row['description']
            log.info(
                'source, blogname, name, location, description: %r, %r, %r, %r, %r',
                inf.source, inf.blogname, inf.name, inf.demographics_location,
                inf.description[:100])
            inf.save()

            # update blogname for blog platform
            blog_pl_q = inf.platform_set.filter(url=row['url'])
            if blog_pl_q.exists():
                blog_pl = blog_pl_q[0]
                log.info('Updating blogname of %r', blog_pl)
                blog_pl.blogname = row['blog_name']
                blog_pl.save()
        # BUGFIX: was a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit and made the import unabortable
        except Exception:
            log.exception('While processing %s, skipping', row)
예제 #7
0
def import_from_blog_url(follower_id, to_save=True):
    """Create an influencer from a Follower's blog url, recording the op.

    Skips followers whose url looks invalid or whose resolved domain is
    blacklisted; otherwise creates an influencer with source
    'comments_import' and records the outcome on the OpRecorder.
    """
    follower = models.Follower.objects.get(id=follower_id)
    with platformutils.OpRecorder(operation='import_from_pi',
                                  follower=follower) as opr:
        url = utils.url_without_path(follower.url)
        log.info('Will check url %r', url)
        looks_invalid = any(marker in url for marker in ('@', '(', '..'))
        if looks_invalid:
            log.warn('Invalid follower url: %r', url)
            return
        log.info('import_from_blog_url runs for follower %r', follower)
        url = utils.resolve_http_redirect(url)
        if utils.domain_from_url(url) in BLACKLISTED_DOMAINS:
            log.info('Domain %r is blacklisted', utils.domain_from_url(url))
            return
        inf = helpers.create_influencer_and_blog_platform(
            url, 'comments_import', to_save)
        if not inf:
            log.error('Blacklisted url: %r', url)
        # record either the created influencer id or a skip counter on the op
        if inf and inf.id is not None:
            opr.data = {'inf_id_created': [inf.id]}
        else:
            opr.data = {'inf_cnt_skipped': 1}
예제 #8
0
    def create_post(url):
        """Return a Posts row for *url*, creating influencer/post as needed.

        Derives the blog url from the post url, finds or creates the
        influencer, runs full processing if the influencer is not QA-ed yet,
        and finally looks the post up; when the post was not fetched, an
        artificial Posts row is created. Returns None (and emails admins)
        when no valid influencer is found.
        """
        blog_url = utils.post_to_blog_url(url)
        inf = helpers.create_influencer_and_blog_platform(
            blog_url, 'import_from_post_analytics', True, True)

        if inf:
            platform = inf.blog_platform
            print("Inf.validated_on: %r" % inf.validated_on)
            if not inf.validated_on or 'info' not in inf.validated_on:
                # it's not QA-ed yet, so let's process this sequentially
                postprocessing.process_new_influencer_sequentially(
                    inf.id, True)
            # at this point, we should have data for the influencer;
            # now let's check if we got the post
            post = find_post_by_url(url, True, platform=platform)

            if post is None:
                # here we just create a quick post artifically (ideally we should have fetched this post)
                post = Posts.objects.create(platform=platform,
                                            influencer=inf,
                                            show_on_search=inf.show_on_search,
                                            url=url)
            return post

        print("No valid influencer found")
        helpers.send_admin_email_via_mailsnake(
            "Post Analytics: No valid influencer found %r" % url,
            "During our post analytics, we didn't find an influencer for this Post.url=%r"
            % (url))
        return None
예제 #9
0
def search_infs_by_giveaways(pages=20, brand_slice=(12, 13)):
    """Search Google for giveaway queries per supported brand, creating influencers.

    For each brand in the selected slice of supported Brands and each template
    in GOOGLE_QUERIES, scrapes Google results and creates an influencer for
    every url that is neither blacklisted nor a duplicate.

    :param pages: number of Google result pages to scan per query
    :param brand_slice: (start, stop) slice of the supported-brands queryset;
        defaults to the previously hard-coded 12:13 range for compatibility
    """
    start, stop = brand_slice
    brands = models.Brands.objects.filter(
        supported=True).order_by('id')[start:stop]
    for brand in brands:
        for q in GOOGLE_QUERIES:
            q = q.format(brand=brand)
            log.info('Searching: %r', q)
            try:
                with xbrowser.XBrowser(headless_display=settings.
                                       AUTOCREATE_HEADLESS_DISPLAY) as xb:
                    g = GoogleScraper(xb)
                    it = g.search(q, pages)
                    for results in it:
                        for url in results:
                            try:
                                if utils.domain_from_url(
                                        url
                                ) in import_from_blog_post.exclude_domains_set:
                                    log.warn('%r is blacklisted', url)
                                    continue
                                dups = models.Influencer.find_duplicates(url)
                                log.info('%r dups: %s', url, dups)
                                if not dups:
                                    log.info('YES_CREATE %r', url)
                                    new_inf = helpers.create_influencer_and_blog_platform(
                                        url,
                                        'google',
                                        platform_name_fallback=True)
                                    log.info('Created influencer: %r', new_inf)
                                else:
                                    log.info('NO_CREATE %r', url)
                            # except Exception (not bare except) so Ctrl-C aborts
                            except Exception:
                                log.exception(
                                    'While processing url %r, skipping', url)
            except Exception as e:
                # lazy %-args instead of eager interpolation in the log call
                log.exception('For brand %r got exception: %s', brand, e,
                              extra={'pages': pages})
예제 #10
0
def create_influencer_from_instagram(profile_id, to_save):
    """
    Create an Influencer (with an artificial blog url) from an InstagramProfile.

    :param profile_id: id of the InstagramProfile to process
    :param to_save: when True, persist created objects, update social handles
        and enqueue platform fetch tasks
    :return: (False, existing_infs) when matching influencers already exist,
             otherwise (True, inf) with the created/fetched influencer
    """
    profile = InstagramProfile.objects.get(id=profile_id)

    existing_infs, valid_urls = find_matching_influencers_for_profile(profile)
    # We don't handle the case when there're matching influencers
    if existing_infs:
        return False, existing_infs
    '''
    algorithm:
        1. Create an influencer with a fake blog url
        2. Then create a platform object for each of the platforms that we're
           able to discover
            - It could be a youtube or facebook or pinterest or twitter
                - Mark all these platforms as autovalidated
            - Use these platforms to discover other related platforms
                - These should be automatically validated also
            - Issue fetch tasks for these automatically validated platforms
        3. Extract email if given
    '''
    plats = []
    # creating a unique influencer blog url that is concurrency-safe
    blog_url = 'http://www.theshelf.com/artificial_blog/{}.html'.format(
        int(time.time()))
    inf = helpers.create_influencer_and_blog_platform(
        blog_url,
        influencer_source='discovered_via_instagram',
        to_save=to_save,
        platform_name_fallback=True)
    # NOTE(review): create_influencer_and_blog_platform returns None for
    # blacklisted urls elsewhere in this module; inf.id here assumes it is
    # always non-None for the artificial url -- confirm
    log.info('Influencer object %s created/fetched.', inf.id)

    if to_save:
        inf.save()
        # record the artificial-blog creation as a PlatformDataOp for auditing
        _ = PlatformDataOp.objects.create(
            influencer=inf, operation='inf_articial_blog_from_instagram_crawl')

    # create a platform per discovered url and sync social handle fields
    for valid_url in valid_urls:
        platform = create_platform_for_influencer(url=valid_url,
                                                  inf=inf,
                                                  profile=profile,
                                                  to_save=to_save)
        if not platform:
            continue
        if to_save:
            field_name = Influencer.platform_name_to_field[
                platform.platform_name]
            admin_helpers.handle_social_handle_updates(inf, field_name,
                                                       platform.url)
        plats.append((
            platform,
            'discovered_via_instagram',
        ))

    log.debug('After performing all urls, insta_url is: %s', inf.insta_url)

    # now, using the created platforms, see if we can create new platforms
    platformextractor.do_further_validation_using_validated_platforms(
        plats, [])

    log.debug('After do_further_validation, insta_url is: %s', inf.insta_url)

    profile.discovered_influencer = inf
    if to_save:
        profile.valid_influencer = True
        profile.save()
        # queue data fetches for every platform created above
        for platform, _ in plats:
            fetchertasks.fetch_platform_data.apply_async(
                [
                    platform.id,
                ], queue='new_influencer')

    log.debug('Finally Influencer has insta_url: %s', inf.insta_url)
    log.debug(
        ('And finally, profile with id %s should have discovered influencer '
         'with id: %s (to_save is %s)'), profile.id, inf.id, to_save)

    # Here we are fetching email, blogname, name, locations from platforms
    get_influencers_email_name_location_for_profile(profile_id,
                                                    to_save=to_save)
    # TODO: links to other platforms using @ sign or just like (snapchat: blah)

    return True, inf
예제 #11
0
    def detect_influencer(self):
        """
        Detects influencer according to the diagram

        :return: Influencer Id
        """
        self.report_data = dict()

        # checking if this profile has been performed before (if it has any IC_* actual tags)
        tags = self.profile.tags.split()
        if any(t in self.TAGS for t in tags):
            # looks like this profile was already performed, skipping it
            return 'already_preformed'

        # removing existing discovered_influencer if any presents
        present_influencer = self.profile.discovered_influencer
        if present_influencer is not None:
            self.profile.discovered_influencer = None
            if self.save is True:
                self.profile.save()

        # Getting profile's discovered platform ids
        existing_platform_ids = self.profile.get_platform_ids_detected()
        non_social_urls = self.profile.get_non_social_urls_detected()

        log.info('Detecting influencer for InstagramProfile %s ...' %
                 self.profile.id)

        self.report_data['profile_id'] = self.profile.id
        self.report_data['existing_platform_ids_qty'] = len(
            existing_platform_ids)
        self.report_data['non_social_urls_qty'] = len(non_social_urls)

        if len(existing_platform_ids) >= 1:
            log.info('Found %s platform ids' % len(existing_platform_ids))
            # There are at least 1 discovered existing platform for this Profile
            # fetching all platforms except those with url_not_found=True
            # UPDATE: and then detecting influencers of these platforms. If there is only one influencer - using it

            active_plats = Platform.objects.filter(
                id__in=existing_platform_ids).exclude(url_not_found=True)
            active_influencers_ids = set()
            for p in active_plats:
                if p.influencer is not None:
                    active_influencers_ids.add(p.influencer.id)

            active_influencers_ids = list(active_influencers_ids)

            self.report_data['active_influencers_ids'] = active_influencers_ids

            log.info(
                'Found %s existing platforms with %s distinctive influencers' %
                (len(existing_platform_ids), len(active_influencers_ids)))

            if len(active_influencers_ids) == 1:
                # Great! Only platforms with one distinctive influencers found, working with it: adding this
                # influencer to collection, connecting it to InstagramProfile

                log.info(
                    'Found 1 influencer (%s), setting IC_one_inf_found tag, setting '
                    'influencer to InstagramProfile' %
                    active_influencers_ids[0])

                candidate_influencer = Influencer.objects.get(
                    id=active_influencers_ids[0])

                if candidate_influencer.blog_url is not None and candidate_influencer.blog_url.startswith(
                        'http://www.theshelf.com/artificial_blog/'):
                    inf = Influencer.objects.get(id=active_influencers_ids[0])

                    # TODO: connecting existing artificial influencer?
                    self.profile.discovered_influencer = candidate_influencer
                    if self.save is True:
                        self.profile.save()

                        self.add_influencer_to_discovered_collection(
                            candidate_influencer)

                        self.profile.append_mutual_exclusive_tag(
                            'IC_one_artificial_inf_found',
                            self.TAGS + self.obsolete_tags)

                    self.report_data[
                        'result'] = 'One existing influencer found (artificial/osos): %s (osos: %s / sos: %s)' % (
                            active_influencers_ids[0],
                            inf.old_show_on_search,
                            inf.show_on_search,
                        )
                    return 'IC_one_artificial_inf_found'
                else:
                    self.profile.discovered_influencer = candidate_influencer
                    if self.save is True:
                        self.profile.save()

                        self.add_influencer_to_discovered_collection(
                            candidate_influencer)

                        self.profile.append_mutual_exclusive_tag(
                            'IC_one_inf_found', self.TAGS + self.obsolete_tags)

                    self.report_data['result'] = 'One existing influencer found and set to ' \
                                                 'profile (non-artificial, non-osos): %s (osos: %s / sos: %s)' % (
                        active_influencers_ids[0],
                        candidate_influencer.old_show_on_search,
                        candidate_influencer.show_on_search,
                    )
                    return 'IC_one_inf_found'

            elif len(active_influencers_ids) > 1:
                # We discovered more than one active platforms with more than one distinctive influencers.

                log.info(
                    'Found more than 1 platform with more than 1 distinctive '
                    'Influencers, setting tag IC_many_plats_found')

                # self.profile.append_mutual_exclusive_tag('IC_many_infs_found', self.TAGS)

                infs = Influencer.objects.filter(
                    id__in=active_influencers_ids,
                    old_show_on_search=True).exclude(blacklisted=True)

                if infs.count() == 0:
                    # None found, we pick the best _select_influencer_to_stay(),
                    # connect to the profile and add to the collection

                    active_infs = Influencer.objects.filter(
                        id__in=active_influencers_ids)
                    best_one = active_infs[0]._select_influencer_to_stay(
                        list(active_infs))

                    self.profile.discovered_influencer = best_one
                    if self.save is True:
                        self.profile.save()
                        # self.add_influencer_to_discovered_collection(best_one)
                        self.profile.append_mutual_exclusive_tag(
                            'IC_best_from_several',
                            self.TAGS + self.obsolete_tags)

                    several_infs = [
                        "%s  (osos: %s / sos: %s)" %
                        (inf.id, inf.old_show_on_search, inf.show_on_search)
                        for inf in active_infs
                    ]
                    self.report_data['result'] = 'Several existing influencers found (no osos=True): %s , ' \
                                                 'taken best of them: %s  (osos: %s / sos: %s)' % (
                        several_infs,
                        best_one.id,
                        best_one.old_show_on_search,
                        best_one.show_on_search
                    )

                    return 'IC_best_from_several'

                elif infs.count() == 1:
                    # One Influencer with old_show_on_search=True found, using it
                    candidate_influencer = infs[0]
                    self.profile.discovered_influencer = candidate_influencer
                    if self.save is True:
                        self.profile.save()
                        # self.add_influencer_to_discovered_collection(candidate_influencer)
                        self.profile.append_mutual_exclusive_tag(
                            'IC_one_from_several',
                            self.TAGS + self.obsolete_tags)

                    several_infs = [
                        "%s  (osos: %s / sos: %s)" %
                        (inf.id, inf.old_show_on_search, inf.show_on_search)
                        for inf in infs
                    ]
                    self.report_data['result'] = 'Several existing influencers found: %s , taken ' \
                                                 'one of them with osos=True: %s  (osos: %s / sos: %s)' % (
                        several_infs,
                        candidate_influencer.id,
                        candidate_influencer.old_show_on_search,
                        candidate_influencer.show_on_search,
                    )

                    return 'IC_one_from_several'

                else:
                    # Multiple found - adding these to collection of duplicates
                    if self.save is True:
                        self.add_influencers_to_duplicates_collection(
                            influencers=infs)

                        self.profile.append_mutual_exclusive_tag(
                            'IC_many_infs_found',
                            self.TAGS + self.obsolete_tags)

                    self.report_data['result'] = 'Several existing influencers found: %s, taken those with osos=True ' \
                                                 'and putting them to duplicates collection.' % [
                        "%s  (osos: %s / sos: %s)" % (inf.id,
                                                      inf.old_show_on_search,
                                                      inf.show_on_search) for inf in infs
                    ]

                return 'IC_many_infs_found'

        # There are 0 discovered platforms, checking with non-social urls
        if len(non_social_urls) == 0:
            # Creating influencer with artificial url, adding it to collection, connecting it to the profile

            log.info(
                'No non-social urls found, creating artificial Influencer and adding it to the profile'
            )

            count_str = '%s' % (int(time.time()))
            blog_url = 'http://www.theshelf.com/artificial_blog/%s.html' % count_str
            inf = create_influencer_and_blog_platform(
                blog_url,
                influencer_source='discovered_via_instagram',
                to_save=True,
                platform_name_fallback=True)

            self.profile.discovered_influencer = inf
            if self.save is True:
                self.profile.save()
                # TODO: Should we create here an instagram platform too?
                self.add_influencer_to_discovered_collection(inf)
                self.profile.append_mutual_exclusive_tag(
                    'IC_artificial_inf_created',
                    self.TAGS + self.obsolete_tags)

            log.info('Adding IC_artificial_inf_created tag')

            self.report_data['result'] = 'No social/non-social platforms found - creating ' \
                                         'artificial Influencer: %s (osos: %s / sos: %s).' % (inf.id,
                                                                                              inf.old_show_on_search,
                                                                                              inf.show_on_search)

            return 'IC_artificial_inf_created'

        else:
            # There are some non-social urls -- checking if there are unique non-social urls

            # Special shortcut: if non-social urls contain liketoknow.it url. If this url is found, then using it as a
            # blog url for this future influencer

            from platformdatafetcher.producturlsextractor import get_blog_url_from_liketoknowit

            # NEW logic to check for bloggy urls
            log.info(
                '%s non-social urls found: %s, trying to find unique root domains'
                % (len(non_social_urls), non_social_urls))

            blog_urls_found = []

            from platformdatafetcher.platformextractor import collect_social_urls_from_blog_url, \
                substitute_instagram_post_urls

            # detecting if any of non-social urls are blogs
            with xbrowsermod.XBrowser(
                    headless_display=settings.AUTOCREATE_HEADLESS_DISPLAY,
                    load_no_images=True,
                    disable_cleanup=False,
                    timeout=60) as xb:

                # social urls chunks, we need to prepare social urls into detectable chunks like 'www-less domain/path'
                social_chunks = []
                for url in self.profile.get_social_urls_detected():
                    parsed = urlparse(url)
                    chunk = '%s%s' % (parsed.netloc[4:]
                                      if parsed.netloc.startswith('www.') else
                                      parsed.netloc, parsed.path)
                    chunk = chunk.strip('/')
                    if chunk not in social_chunks:
                        social_chunks.append(chunk)

                # Log the social-handle fragments extracted above; they are matched
                # (substring, case-insensitive) against social urls scraped from
                # candidate blog pages below to confirm ownership.
                log.info('Social url fragments for searching: %s' %
                         social_chunks)

                # detecting if any found socials in there
                non_social_urls = self.profile.get_non_social_urls_detected()
                unique_root_domains = self.get_unique_root_domains(
                    non_social_urls)
                # Examine one representative url per unique root domain
                # (element [0] of each domain's url list).
                for k in unique_root_domains.keys():
                    non_social_url_start = unique_root_domains[k][0]

                    # checking if this url is a good liketoknow.it url and blog url can be retrieved:
                    parsed = urlparse(non_social_url_start)
                    # checking if domain is liketoknow.it
                    # (a usable liketoknow.it url must also have a non-empty,
                    # non-'login' path — i.e. an actual profile slug)
                    if parsed.netloc.lower().strip().replace('www.', '', 1) == 'liketoknow.it' and \
                            parsed.path.lower().strip('/').strip() not in ['', 'login']:

                        log.info(
                            'Liketoknow.it url detected: %r , trying to get its blog url'
                            % non_social_url_start)

                        # looks like it is a good liketoknow.it url, getting blog url
                        # NOTE(review): assumes `xb` is a browser/driver bound
                        # earlier in this method (outside this excerpt) — confirm.
                        blog_url = get_blog_url_from_liketoknowit(
                            non_social_url_start, xb)
                        if blog_url is not None:
                            log.info(
                                'Blog url detected successfully: %r , considering it a good blog url'
                                % blog_url)
                            # adding it to blog_urls detected
                            if blog_url not in blog_urls_found:
                                blog_urls_found.append(blog_url)
                            else:
                                log.info('Blog url %r is already detected' %
                                         blog_url)
                        else:
                            log.info('Blog url was not detected')

                    else:
                        # Not a liketoknow.it url: ask the project helper whether
                        # the url itself looks like a blog (it may also normalize
                        # the url, hence the second return value).
                        is_blog_url, non_social_url = self.is_url_a_blog(
                            non_social_url_start, self.profile)
                        log.info('Checking if %r is a blog:' % non_social_url)
                        if is_blog_url is True and non_social_url is not None:
                            log.info('Perfect, %r is a blog' % non_social_url)
                            socials_detected = []
                            found_soc_urls = defaultdict(list)
                            # Scrape social-network links off the candidate blog
                            # page, grouped by platform name.
                            collect_social_urls_from_blog_url(
                                xb=xb,
                                by_pname=found_soc_urls,
                                platform=None,
                                non_social_url=non_social_url)

                            substitute_instagram_post_urls(found_soc_urls)

                            log.info('SOCIAL URLS COLLECTED: %s' %
                                     found_soc_urls)

                            # if no social urls were collected, we're checking if this non-social url has
                            # social urls in any form with regexps by its content and iframes.
                            if len(found_soc_urls) == 0:
                                scraped_social_urls = collect_any_social_urls(
                                    xb=xb, non_social_url=non_social_url)
                                log.info(
                                    'Thorough search found %s candidate social urls '
                                    'to check' % len(scraped_social_urls))
                                found_soc_urls[
                                    'Bruteforce'] = scraped_social_urls

                            # found_socials is in format {'Instagram': ['url1', 'url2',...], 'Facebook': [...], ...}
                            # Keep only scraped social urls that contain one of
                            # this profile's own social fragments — evidence the
                            # blog belongs to this profile.
                            for social_url_lst in found_soc_urls.values():
                                for social_url in social_url_lst:
                                    if any([
                                            sc.lower() in social_url.lower()
                                            for sc in social_chunks
                                    ]):
                                        # we found one of social chunks in detected social url
                                        if social_url not in socials_detected:
                                            socials_detected.append(social_url)

                            log.info('Positively matched social urls: %s' %
                                     socials_detected)

                            # if we found some matching social urls - then it is a blog url, TA-DAAAA!
                            if len(socials_detected) > 0:
                                if non_social_url not in blog_urls_found:
                                    # TODO: should we use here self.is_url_a_blog(url, self.profile) for extra blog check?
                                    blog_urls_found.append(non_social_url)
                                    log.info(
                                        'Considering url %r to be a blog url for this profile'
                                        % non_social_url)

                        else:
                            log.info(
                                'Url %r considered as non-blog url or is unreachable'
                                % non_social_url_start)

            # Three-way outcome on how many confirmed blog urls were found:
            # exactly one -> create influencer; none -> retry later or tag as
            # possible brand; several -> tag and skip for now.
            if len(blog_urls_found) == 1:
                # we found 1 blog url
                log.info('Looks like it is a new single blog url!')
                self.report_data['unique_root_domain_is_blog'] = True

                # Here we have found 0 existing platforms, but we detected that a single non-social url
                # is a BLOG. So we create a blog platform with this url, creating an influencer, connecting
                # this blog platform to this influencer and connecting the influencer to the profile.

                # creating new blog platform
                inf = create_influencer_and_blog_platform(
                    blog_url=blog_urls_found[0],
                    influencer_source='ic_from_insta_profile',
                    to_save=self.save,
                    platform_name_fallback=True)
                self.profile.discovered_influencer = inf
                log.info('A new influencer has been created: %s' % inf)
                # Persist only when the pipeline runs in save mode (dry-run safe).
                if self.save is True:
                    self.profile.save()
                    self.add_influencer_to_discovered_collection(inf)
                    self.profile.append_mutual_exclusive_tag(
                        'IC_new_blog_new_inf', self.TAGS + self.obsolete_tags)

                # NOTE(review): if create_influencer_and_blog_platform can
                # return None (blacklisted url), inf.id below would raise —
                # confirm against that helper's contract.
                self.report_data['result'] = 'New influencer %s (osos: %s / sos: %s) created by single ' \
                                             'non-social blog platform' % (inf.id,
                                                                           inf.old_show_on_search,
                                                                           inf.show_on_search)

                return 'IC_new_blog_new_inf'

            elif len(blog_urls_found) == 0:
                # if none found to be a blog
                #   => check if the length of the url > 20 chars (typically identifies as a
                #           product) => then this profile needs to be fetched again later
                #     => create a new field "date_to_fetch_later" in InstagramProfile and update this field
                #           with today+10 days later
                #     => need to create a celery task that checks if today is the day when they should be
                #           re-fetched and then clears up this date_to_fetch_later to None
                #     => after fetching the profile, compare the old url and description with new one, check
                #           if it's different, then pass it to the same pipeline as it was originally part of

                log.info('No blog urls were detected within non_social_urls')

                # TODO: what should we do if this already has date_to_fetch_later != None ?
                long_url = False
                # A single long (>20 chars) url is treated as a probable product
                # link, so the profile is scheduled for a re-fetch in 10 days.
                for non_social_url in non_social_urls:
                    if len(non_social_url) > 20:
                        self.profile.date_to_fetch_later = datetime.now(
                        ) + timedelta(days=10)
                        if self.save is True:
                            self.profile.save()
                        long_url = True
                        break

                if long_url is True:

                    self.report_data[
                        'result'] = 'No blog urls were found, retrying in 10 days'
                    return '10_days_later'
                else:
                    # TODO: What should we do here, should we create an artificial url?

                    if self.save is True:
                        self.profile.append_mutual_exclusive_tag(
                            'IC_possible_brand',
                            self.TAGS + self.obsolete_tags)

                    self.report_data[
                        'result'] = 'Profile considered to be possibly a brand.'
                    return 'IC_possible_brand'

            else:
                # TODO: Skipping for now...

                # NOTE(review): BUG — this format string has no '%s'
                # placeholder, yet '%' is applied to the list blog_urls_found;
                # str.__mod__ with a non-tuple, unconsumed argument raises
                # TypeError ('not all arguments converted...'), so this branch
                # crashes before tagging. Should read '...tag: %s' %
                # blog_urls_found (or use lazy logging args).
                log.info(
                    'We found many non-social blog domains, setting IC_many_nonsocial_found tag:'
                    % blog_urls_found)

                if self.save is True:
                    self.profile.append_mutual_exclusive_tag(
                        'IC_many_nonsocial_found',
                        self.TAGS + self.obsolete_tags)

                self.report_data[
                    'result'] = 'Multiple unique root domains found, skipped for now'
                return 'IC_many_nonsocial_found'