Example #1
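# Builds Review documents from the scraped products in models.scrape_li and
# bulk-indexes them into Elasticsearch in batches of roughly 100 actions.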
def index_products_data():
    count = 1
    data = []
    for product_data in models.scrape_li:
        review_count = 1
        reviews = product_data['reviews']
        for review in reviews:
            r = models.Review()
            r.reviewid = product_data['url'] + "?review=" + str(review_count)
            review_count = review_count + 1
            r.perfume = product_data['perfume']
            r.site = product_data['site']
            r.brand_name = product_data['brand_name']
            r.brand_variant = product_data['brand_variant']
            r.review_date = datetime.strptime(review['date'],
                                              '%b %d %Y').date()
            r.review = review['body']
            r.label = review['label']
            r.accords = product_data.get('accords', {})
            notespyramid = product_data.get('notespyramid', {})
            r.notespyramid = notespyramid.get('top', [])
            r.moods = product_data.get('moods', {})
            r.notes = product_data.get('notes', {})
            r.longevity = product_data.get('longevity', {})
            r.sillage = product_data.get('sillage', {})
            r.ratings = product_data.get('ratings', {})
            r.img_src = product_data.get('img_src', "")
            data.append(elastic.convert_for_bulk(r, 'update'))
            count = count + 1
            if count > 100:
                bulk(models.client, actions=data, stats_only=True)
                data = []  # reset the batch so the same actions are not re-sent
                count = 1
    bulk(models.client, actions=data, stats_only=True)  # flush the final partial batch
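All of these examples funnel their documents through a project-local
elastic.convert_for_bulk(doc, 'update') helper before calling
elasticsearch.helpers.bulk. As a rough sketch only (the index name and field
layout below are assumptions, not the project's actual values), an 'update'
action for the bulk helper typically looks like this:

# Hypothetical sketch; convert_for_bulk is project code we cannot see,
# so every name here is illustrative only.
def make_update_action(doc_id, fields):
    return {
        "_op_type": "update",   # partial update rather than a full re-index
        "_index": "reviews",    # assumed index name
        "_id": doc_id,          # e.g. the reviewid built above
        "doc": fields,          # field values to merge into the document
        "doc_as_upsert": True,  # create the document if it does not exist
    }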
Example #2
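    # Maps a scraped page onto a models.PageMap document (date, page text,
    # title) and returns a single bulk update action.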
    def scrape_page_map(self, sub_site, url, bs):
        page_id = url  # avoid shadowing the id() builtin
        site_url = urlparse(url).netloc.split('.')[1]
        sub_site_url = urlparse(url).path.split('/')
        # join the URL path segments; joining the sub_site string itself would
        # interleave '-' between its single characters
        sub_site_name = '-'.join(sub_site_url[1:-1])
        if sub_site_name == '':
            sub_site_name = 'Home'
        pagemap = models.PageMap()

        pagemap.page_id = page_id
        pagemap.site = self.site
        pagemap.sub_site = sub_site
        pagemap.url = url
        pagemap.section = ''

        try:  # posted date: nothing on the page, default to today
            pagemap.published_date = datetime.today()
        except Exception:
            pass
        try:  # full page text
            pagemap.page = bs.get_text()
        except Exception:
            pass
        try:  # title, falling back to an empty string
            if bs.title is not None:
                pagemap.title = bs.title.text
            else:
                pagemap.title = ''
        except Exception:
            pass

        data = elastic.convert_for_bulk(pagemap, 'update')
        return data
Example #3
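    # PageMap scraper for a site that marks articles up with "box_1"/"box_2"
    # divs; every field is extracted best-effort in its own try block.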
    def scrape_page_map(self, sub_site, url, bs):
        page_id = url  # avoid shadowing the id() builtin
        pagemap = models.PageMap()
        pagemap.page_id = page_id
        pagemap.site = self.site
        pagemap.sub_site = sub_site
        pagemap.url = url

        # get posted date
        try:
            pagemap.posted_date = datetime.today()  # default when no date is found
            author_info_tag = bs.find("div", class_="author_info")
            published = author_info_tag.find('p', class_='date').text
            pagemap.posted_date = datetime.strptime(published, '%d-%b-%Y')
        except Exception:
            pass
        try:
            box_1_tag = bs.find("div", class_="box_1")
            product_info_bar_tag = box_1_tag.find("div",
                                                  class_="product_info_bar")
            published = re.search(r'([0-9]{2}-[a-zA-Z]{3}-[0-9]{4})',
                                  product_info_bar_tag.text, re.MULTILINE)
            pagemap.posted_date = datetime.strptime(published.group(0),
                                                    '%d-%b-%Y')
        except Exception:
            pass
        # get page
        try:
            pagemap.page = bs.get_text()  # fallback: the whole page text
            box_1_tag = bs.find("div", class_="box_1")
            pagemap.page = box_1_tag.text
            product_main_text_tag = box_1_tag.find("div",
                                                   class_="product_main_text")
            if product_main_text_tag is not None:
                pagemap.page = product_main_text_tag.text
            else:
                story_tag = box_1_tag.find("div", class_="story")
                pagemap.page = story_tag.text
        except Exception:
            pass
        # get title
        try:
            if bs.title is not None:
                pagemap.title = bs.title.text
            else:
                pagemap.title = ''
            box_1_tag = bs.find("div", class_="box_1")
            pagemap.title = box_1_tag.find("h1").text
        except Exception:
            pass
        # get section
        try:
            box_2_tag = bs.find("div", class_="box_2")
            pagemap.section = box_2_tag.text.strip(' \t\n\r')
        except Exception:
            pass

        data = elastic.convert_for_bulk(pagemap, 'update')
        return data
Example #4
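# Groups survey facts per blindcode into (answer, percentage) buckets and
# indexes one StudiesMap document per blindcode, flushing every 100 documents.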
def crawl_studies_facts(survey_field, facts_d):
    bulk_data = []
    count = 0
    total_count = 0
    facts_df = DataFrame.from_dict(facts_d, orient='index')
    facts_df['blindcode'] = [ix[0] for ix in facts_df.index]
    facts_df['fact'] = [ix[1] for ix in facts_df.index]
    facts_df['answer'] = [ix[2] for ix in facts_df.index]

    for blindcode, facts_blindcode_df in facts_df.groupby(
            facts_df['blindcode']):
        se = models.StudiesMap()
        se.cft_id = blindcode
        se.dataset = survey_field
        se.ingr_name = blindcode
        se.IPC = blindcode
        percentile = {}

        for idx, fact_s in facts_blindcode_df.iterrows():
            fact = fact_s['fact']
            answer = fact_s['answer']
            #se.supplier = "CI"
            #se.olfactive = cft_s.olfactive
            #se.region = cft_s.region
            #se.review = cft_s.review
            #se.dilution = cft_s.dilution
            #se.intensity = cft_s.intensity

            if fact not in percentile:
                percentile[fact] = []
            val = answer
            prc = fact_s[0]  # the first column holds the value from facts_d
            if prc > 0:
                percentile[fact].append((val, prc))

        for fact in percentile:
            if fact == 'emotion':
                se.emotion = percentile[fact]
            elif fact == 'suitable_stage':
                se.suitable_stage = percentile[fact]
            elif fact == 'hedonics':
                se.hedonics = percentile[fact]
            elif fact == 'freshness':
                se.freshness = percentile[fact]

        data = elastic.convert_for_bulk(se, 'update')
        bulk_data.append(data)
        count = count + 1
        if count > 100:
            bulk(models.client, actions=bulk_data, stats_only=True)
            total_count = total_count + count
            print(
                "crawl_studies_facts: written another batch, total written {0:d}"
                .format(total_count))
            bulk_data = []
            count = 1

    bulk(models.client, actions=bulk_data, stats_only=True)
Example #5
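# Loads a scentemotion CSV and indexes one ScentemotionMap document per row,
# collecting (value, percentage) pairs per fact from the fact-prefixed columns.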
def load_scentemotion(cft_filename):
    ml_file = 'data/' + cft_filename
    cft_df = pd.read_csv(ml_file, sep=';', encoding='ISO-8859-1', low_memory=False)
    cft_df.fillna(0, inplace=True)
    cft_df.index = cft_df['cft_id']
    bulk_data = []
    count = 0
    total_count = 0
    for cft_id, cft_s in cft_df.iterrows():
        se = models.ScentemotionMap()
        se.cft_id = cft_id
        se.dataset = "ingredients"
        se.ingr_name = cft_s.ingr_name
        se.IPC = cft_s.IPC
        se.supplier = cft_s.supplier
        se.olfactive = cft_s.olfactive
        se.region = cft_s.region
        se.review = cft_s.review
        se.dilution = cft_s.dilution
        se.intensity = cft_s.intensity

        percentile = {}
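        # columns are named "<fact>_<value>" (e.g. a hypothetical "mood_happy");
        # split once to recover the fact bucket and the value label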
        for col in cft_s.index:
            col_l = col.split("_", 1)
            fct = col_l[0]
            if fct not in ["mood", "smell", "negative", "descriptor", "color", "texture"]:
                continue
            if fct not in percentile:
                percentile[fct] = []
            val = col_l[1]
            prc = cft_s[col]
            if prc > 0:
                #percentile[fct].append((val, "{0:4.2f}".format(prc)))
                percentile[fct].append((val, prc))

        # .get: a fact bucket with no positive values would otherwise raise KeyError
        se.mood = percentile.get("mood", [])
        se.smell = percentile.get("smell", [])
        se.negative = percentile.get("negative", [])
        se.descriptor = percentile.get("descriptor", [])
        se.color = percentile.get("color", [])
        se.texture = percentile.get("texture", [])

        data = elastic.convert_for_bulk(se, 'update')
        bulk_data.append(data)
        count = count + 1
        if count > 100:
            bulk(models.client, actions=bulk_data, stats_only=True)
            total_count = total_count + count
            print("load_scentemotion: written another batch, total written {0:d}".format(total_count))
            bulk_data = []
            count = 1

    bulk(models.client, actions=bulk_data, stats_only=True)
Example #6
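    # PageMap scraper for WordPress-style markup ("entry-date",
    # "entry-content", "entry-title"); the section is simply the sub_site name.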
    def scrape_page_map(self, sub_site, url, bs):
        page_id = url  # avoid shadowing the id() builtin
        pagemap = models.PageMap()
        pagemap.page_id = page_id
        pagemap.site = self.site
        pagemap.sub_site = sub_site
        pagemap.url = url

        # get posted date
        # <span class="entry-date">May 23, 2017</span>
        try:
            pagemap.posted_date = datetime.today()
            entry_date_tag = bs.find("span", class_="entry-date")
            published = entry_date_tag.text
            pagemap.posted_date = datetime.strptime(published, '%B %d, %Y')
        except Exception:
            pass
        #try:
        #    box_1_tag = bs.find("div", class_="box_1")
        #    product_info_bar_tag = box_1_tag.find("div", class_="product_info_bar")
        #    published = re.search(r'([0-9]{2}-[a-z,A-Z]{3}-[0-9]{4})', product_info_bar.text, re.MULTILINE)
        #    pagemap.posted_date = datetime.strptime(published.group(0), '%d-%b-%Y')
        #except:
        #    pass

        # get page
        # <section class="entry-content">
        try:
            pagemap.page = bs.get_text()
            entry_content_tag = bs.find("section", class_="entry-content")
            pagemap.page = entry_content_tag.text
        except Exception:
            pass
        # get title
        # <h1 class="entry-title"></h1>  text
        try:
            if bs.title is not None:
                pagemap.title = bs.title.text
            else:
                pagemap.title = ''
            entry_title_tag = bs.find("h1", class_="entry-title")
            pagemap.title = entry_title_tag.text
        except Exception:
            pass
        # get section
        try:
            pagemap.section = sub_site
        except Exception:
            pass

        data = elastic.convert_for_bulk(pagemap, 'update')
        return data
Example #7
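    # Same WordPress-style scraper as Example #6, but it stores the date on
    # published_date as a plain date rather than a datetime.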
    def scrape_page_map(self, sub_site, url, bs):
        page_id = url  # avoid shadowing the id() builtin
        pagemap = models.PageMap()
        pagemap.page_id = page_id
        pagemap.site = self.site
        pagemap.sub_site = sub_site
        pagemap.url = url

        # get posted date
        # <span class="entry-date">May 23, 2017</span>
        try:
            pagemap.published_date = datetime.today()
            entry_date_tag = bs.find("span", class_="entry-date")
            published = entry_date_tag.text
            pagemap.published_date = datetime.strptime(published,
                                                       '%B %d, %Y').date()
        except Exception:
            pass

        # get page
        # <section class="entry-content">
        try:
            pagemap.page = bs.get_text()
            entry_content_tag = bs.find("section", class_="entry-content")
            pagemap.page = entry_content_tag.text
        except Exception:
            pass
        # get title
        # <h1 class="entry-title"></h1>  text
        try:
            if bs.title is not None:
                pagemap.title = bs.title.text
            else:
                pagemap.title = ''
            entry_title_tag = bs.find("h1", class_="entry-title")
            pagemap.title = entry_title_tag.text
        except Exception:
            pass
        # get section
        try:
            pagemap.section = sub_site
        except Exception:
            pass

        data = elastic.convert_for_bulk(pagemap, 'update')
        return data
Example #8
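    # PageMap scraper that works from the <article> tag: time, title, section,
    # header image and the "Detail-content" body.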
    def scrape_page_map(self, sub_site, url, bs):
        page_id = url  # avoid shadowing the id() builtin
        pagemap = models.PageMap()
        pagemap.page_id = page_id
        pagemap.site = self.site
        pagemap.sub_site = sub_site
        pagemap.url = url

        article_tag = bs.find('article')
        try:  # posted date
            published = article_tag.find('time').text
            pagemap.published_date = datetime.strptime(published,
                                                       '%d-%b-%Y').date()
        except Exception:
            pass
        try:  # title
            if bs.title is not None:
                pagemap.title = bs.title.text
            else:
                pagemap.title = article_tag.header.h1.text
        except Exception:
            pass
        try:  # section
            if sub_site in ['Skin-care', 'Hair-care']:
                pagemap.section = article_tag.header.p.text.strip()
            else:
                pagemap.section = 'blog'
        except Exception:
            pass
        try:  # img_src
            pagemap.img_src = article_tag.header.figure.img.attrs['src']
        except Exception:
            pass
        try:  # page
            pagemap.page = article_tag.find('div',
                                            class_='Detail-content').text
        except Exception:
            pass

        data = elastic.convert_for_bulk(pagemap, 'update')
        return data
Example #9
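# Like Example #1, but the scraped data is stored as positional tuples rather
# than dicts.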
def push_review_to_index():
    count = 1
    data = []
    for scrape_perfume in models.scrape_li:
        review_count = 1
        scrape_reviews = scrape_perfume[1][4]
        for scrape_review in scrape_reviews:
            review = models.Review()
            review.reviewid = scrape_perfume[1][0] + "?review=" + str(review_count)
            review_count = review_count + 1
            review.perfume = scrape_perfume[0]
            review.review_date = datetime.strptime(scrape_review[0],'%b %d %Y').date()
            review.review = scrape_review[1]
            review.label = scrape_review[2]
            review.accords = scrape_perfume[1][1]
            review.img_src = scrape_perfume[1][5]
            data.append(elastic.convert_for_bulk(review, 'update'))
            count = count + 1
            # flush a full batch so reviews beyond the first 100 are not dropped
            if count > 100:
                bulk(models.client, actions=data, stats_only=True)
                data = []
                count = 1

    bulk(models.client, actions=data, stats_only=True)
Example #10
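# Pushes posts from models.posts_df into the index, remapping editor and
# category ids and splitting each body with scrape_body().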
def push_posts_to_index():
    post_count = 1
    data = []
    for index, sp_post in models.posts_df.iterrows():
        mi_post = models.PostMap()
        mi_post.post_id = sp_post.post_id
        if sp_post.editor_id in editor_map:
            mi_post.editor_id = editor_map[sp_post.editor_id]
        else:
            mi_post.editor_id = sp_post.editor_id
        mi_post.published_date = datetime.strptime(
            sp_post.published_date[0:10], '%Y-%m-%d').date()
        if len(sp_post.post_category_id['results']) > 0:
            post_category_id = sp_post.post_category_id['results'][0]
        else:
            post_category_id = 0
        if post_category_id in categoy_map:
            mi_post.post_category_id = categoy_map[post_category_id]
        else:
            mi_post.post_category_id = post_category_id
        mi_post.title = sp_post.title.encode("ascii", 'replace')
        mi_post.relevance, mi_post.subject, mi_post.topline, mi_post.source, mi_post.article = scrape_body(
            mi_post.title, sp_post.body.encode("ascii", 'replace'))
        try:
            mi_post.average_rating = float(sp_post.average_rating)
            mi_post.rating_count = int(sp_post.rating_count)
            mi_post.num_comments_id = int(sp_post.num_comments_id)
        except Exception:
            print("conversion failed", sp_post.average_rating)

        data.append(elastic.convert_for_bulk(mi_post, 'update'))
        post_count = post_count + 1

    # To add a link, the following URL is needed:
    # https://iffconnect.iff.com/Fragrances/marketintelligence/Lists/Posts/ViewPost.aspx?ID=2922
    bulk(models.client, actions=data, stats_only=True)
Example #11
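# Crawls all Feedly subscriptions newer than from_dt and indexes every stream
# item as a FeedlyMap document, one bulk call per feed.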
def crawl_feedly(from_dt, rss_field):
    global headers

    t = time(0, 0)
    dt = datetime.combine(from_dt, t)
    s = dt.timestamp()
    #datetime.datetime.fromtimestamp(s).strftime('%c')
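    # Feedly's "newerThan" stream parameter is a millisecond epoch timestamp.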
    ms = s * 1000
    newerthan = "{:.0f}".format(ms)
    headers = {
        #[email protected] (expires on 2020-jul-10)
        "Authorization":
        "Azsr6uaruKGMnymDVmYUkDrF33mC2csnyv1OScN4hpsnH5w2ngb0zEBlwyAo4izpB3W3a2RYDAW99xYFM61U5g0U13M59tiAjZFqHkVpAXVeG8PAYl5Y060wwErrxvjj12UNeQ4bk23mzCcoa9AAJtBvUMl_DZl2-jaX0cf_vmlZuVMQh-B2Srv1FUEkno3fbVJtTdZeOc1YP29aRluNyYndpm2CWYKFjaeL1LicHbObhdjgHQAZ-EFUUDCA:feedlydev"
    }

    params_streams = {
        #       "count"     : "100",
        "count": "1000",
        "ranked": "newest",
        "unreadOnly": "false",
        "newerThan": newerthan
    }
    #url = "http://cloud.feedly.com/v3/profile"
    #r = requests.get(url, headers=headers)
    url = "http://cloud.feedly.com/v3/subscriptions"
    r = requests.get(url, headers=headers)
    if r.status_code != 200:
        return False
    feeds = r.json()
    for feed in feeds:
        feed_id = feed['id']
        feed_title = feed['title'].encode("ascii", 'replace')
        # the category label can contain the subset and category name
        category_label = feed['categories'][0]['label']
        label_split = category_label.split('-')
        if len(label_split) > 1:
            feed_subset = label_split[0].strip()
            feed_category = label_split[1].strip()
        else:
            feed_subset = 'SI'
            feed_category = label_split[0].strip()
        print("crawl_feedly: scraping feed category/title", feed_category,
              feed_title)
        if rss_field == '' or category_label == rss_field:
            url = "http://cloud.feedly.com/v3/streams/contents"
            params_streams['streamId'] = feed_id
            r = requests.get(url, headers=headers, params=params_streams)
            stream = r.json()
            if 'items' in stream:
                bulk_data = []
                for entry in stream['items']:
                    feedlymap = models.FeedlyMap()
                    feedlymap.post_id = entry['id']
                    feedlymap.url = ""
                    try:
                        feedlymap.published_date = datetime.fromtimestamp(
                            entry['published'] / 1000)
                    except Exception:
                        last_year = datetime.now().year - 1
                        feedlymap.published_date = datetime(
                            last_year, 1, 1, 00, 00, 00)
                    feedlymap.subset = feed_subset
                    feedlymap.category = feed_category
                    feedlymap.feed = feed_title
                    if 'topics' in feed:
                        feedlymap.feed_topics = feed['topics']
                    if 'keywords' in entry:
                        feedlymap.body_topics = entry['keywords']
                    if 'title' in entry:
                        feedlymap.title = entry['title']
                    if 'canonicalUrl' in entry:
                        feedlymap.url = entry['canonicalUrl']
                    if len(feedlymap.url) == 0:
                        if 'originId' in entry:
                            n = entry['originId'].find('http')
                            if n > 0:
                                feedlymap.url = entry['originId'][n:]
                    if len(feedlymap.url) == 0:
                        if 'origin' in entry:
                            origin = entry['origin']
                            feedlymap.url = origin['htmlUrl']
                    feedlymap.post_id = feedlymap.url
                    if 'summary' in entry:
                        bs = BeautifulSoup(entry['summary']['content'],
                                           "lxml")  # in case of RSS feed
                    if 'content' in entry:
                        bs = BeautifulSoup(
                            entry['content']['content'],
                            "lxml")  # in case of Google News feed
                    feedlymap.body = bs.get_text().encode("ascii", 'replace')
                    data = elastic.convert_for_bulk(feedlymap, 'update')
                    bulk_data.append(data)
                bulk(models.client, actions=bulk_data, stats_only=True)
    return True
Example #12
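# Earlier variant of the Feedly crawler: the category label is not split into
# subset/category, the fallback publish date is fixed at 2010, and a single
# bulk call is made at the end instead of one per feed.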
def crawl_feedly(from_date, rss_field):
    global headers

    bulk_data = []
    t = time(0, 0)
    dt = datetime.combine(from_date, t)
    s = dt.timestamp()
    #datetime.datetime.fromtimestamp(s).strftime('%c')
    ms = s * 1000
    newerthan = "{:.0f}".format(ms)
    headers = {
        #[email protected] (expires on 2017-07-20)
        "Authorization":
        "A2JxorrfeTBQbMUsDIU3_zexSwY8191e3P9EvewYowjfbhKwOgHk84ErlXAWXpucZ_McfTDHLZN6yLxWqxgjWM8Upp1c-6Nb_RpZd0jWA9mJkVLN1JTETefaVNZtZqzTGTf8_qeT2ZE8z6Bf4LqLOUfQaQH2-jj8XIaxAyWMZ5BDRtfpgwVYrEEM2ii5KXnMJZxGNEvcqAV4Dke_subaM-wlnC8N63g:feedlydev"
    }

    params_streams = {
        #       "count"     : "100",
        "count": "1000",
        "ranked": "newest",
        "unreadOnly": "false",
        "newerThan": newerthan
    }
    #url = "http://cloud.feedly.com/v3/profile"
    #r = requests.get(url, headers=headers)
    url = "http://cloud.feedly.com/v3/subscriptions"
    r = requests.get(url, headers=headers)
    if r.status_code != 200:
        return False
    feeds = r.json()
    for feed in feeds:
        feed_id = feed['id']
        feed_title = feed['title'].encode("ascii", 'replace')
        feed_category = feed['categories'][0]['label']
        print("crawl_feedly: scraping feed category/title", feed_category,
              feed_title)
        if rss_field == '' or feed_category == rss_field:
            url = "http://cloud.feedly.com/v3/streams/contents"
            params_streams['streamId'] = feed_id
            r = requests.get(url, headers=headers, params=params_streams)
            stream = r.json()
            if 'items' in stream:
                for entry in stream['items']:
                    feedlymap = models.FeedlyMap()
                    feedlymap.post_id = entry['id']
                    try:
                        feedlymap.published_date = datetime.fromtimestamp(
                            entry['published'] / 1000)
                    except Exception:
                        feedlymap.published_date = datetime(
                            2010, 1, 1, 00, 00, 00)
                    feedlymap.category = feed_category
                    feedlymap.feed = feed_title
                    if 'topics' in feed:
                        feedlymap.feed_topics = feed['topics']
                    if 'keywords' in entry:
                        feedlymap.body_topics = entry['keywords']
                    feedlymap.title = entry['title']
                    if 'canonicalUrl' in entry:
                        feedlymap.url = entry['canonicalUrl']
                    else:
                        n = entry['originId'].find('http')
                        feedlymap.url = entry['originId'][n:]
                    feedlymap.post_id = feedlymap.url
                    if 'summary' in entry:
                        bs = BeautifulSoup(entry['summary']['content'],
                                           "lxml")  # in case of RSS feed
                    if 'content' in entry:
                        bs = BeautifulSoup(
                            entry['content']['content'],
                            "lxml")  # in case of Google News feed
                    feedlymap.body = bs.get_text().encode("ascii", 'replace')
                    data = elastic.convert_for_bulk(feedlymap, 'update')
                    bulk_data.append(data)

    bulk(models.client, actions=bulk_data, stats_only=True)
    return True
Example #13
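# Loads a survey CSV, maps its columns onto SurveyMap fields via
# survey.map_columns, and indexes one document per respondent/blindcode pair.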
def crawl_survey(survey_filename):
    ml_file = 'data/' + survey_filename
    survey_df = pd.read_csv(ml_file,
                            sep=';',
                            encoding='ISO-8859-1',
                            low_memory=False)
    survey_df.fillna(0, inplace=True)
    # col_map[column]: (field, question, answer, dashboard)
    # field_map[field]: [(question=0, answer=1, column=2)]
    field_map, col_map = survey.map_columns(survey_df.columns)
    survey_df.index = survey_df[field_map['resp_id'][0][2]]
    bulk_data = []
    count = 0
    total_count = 0
    for resp_id, survey_s in survey_df.iterrows():
        sl = models.SurveyMap()
        resp_id = survey.answer_value_to_string(
            survey_s[field_map['resp_id'][0][2]])
        blindcode = survey.answer_value_to_string(
            survey_s[field_map['blindcode'][0][2]])
        sl.resp_id = resp_id + "_" + blindcode
        sl.survey = survey_filename
        sl.children = {}
        sl.concept = {}
        sl.emotion = {}
        sl.fragrattr = {}
        sl.mood = {}
        sl.smell = {}
        sl.suitable_product = {}
        sl.suitable_stage = {}
        sl.question = {}
        for field, maps in field_map.items():
            # resp_id is the unique id of the record, this is already set above
            if field == 'resp_id':
                continue
            # map: 0=question, 1=answer, 2=column (see field_map above)
            map = maps[0]
            answer_value = survey_s[map[2]]
            answer_value = survey.answer_value_to_string(answer_value)
            answer_value = survey.answer_value_encode(map[1], answer_value)
            # column mapping, no question
            if map[0] is None:
                # in case of multiple mapping search for the column that has a value
                for ix in range(1, len(maps)):
                    map = maps[ix]
                    answer_value_2 = survey_s[map[2]]
                    answer_value_2 = survey.answer_value_to_string(
                        answer_value_2)
                    if field == 'blindcode':
                        answer_value = answer_value + '-' + answer_value_2[:3]
                    else:
                        if len(answer_value_2) > len(answer_value):
                            answer_value = answer_value_2
                setattr(sl, field, answer_value)
            # answer mapping
            else:
                setattr(sl, field, {map[1]: answer_value})
                attr = getattr(sl, field)
                for ix in range(1, len(maps)):
                    map = maps[ix]
                    answer_value = survey_s[map[2]]
                    answer_value = survey.answer_value_to_string(answer_value)
                    answer_value = survey.answer_value_encode(
                        map[1], answer_value)
                    attr[map[1]] = answer_value
                    #attr.append({map[1]: answer_value})

        data = elastic.convert_for_bulk(sl, 'update')
        bulk_data.append(data)
        count = count + 1
        if count > 100:
            bulk(models.client, actions=bulk_data, stats_only=True)
            total_count = total_count + count
            print("crawl_survey: written another batch, total written {0:d}".
                  format(total_count))
            bulk_data = []
            count = 1
            #break

    bulk(models.client, actions=bulk_data, stats_only=True)
Example #14
def crawl_feedly(from_date, rss_field):
    global headers

    t = time(0, 0)
    dt = datetime.combine(from_date, t)
    s = dt.timestamp()
    #datetime.datetime.fromtimestamp(s).strftime('%c')
    ms = s * 1000
    newerthan = "{:.0f}".format(ms)
    headers = {
        #[email protected] (expires on 2018-feb-02)
        "Authorization":
        "A1j2bsImQdCENT7FyWxSWABu7_KwSQOKNvAySLwJQlQT3QoRlpur6iG56Xju8owoOfMF7byi1ApQUIHbUpsEBoFH-CijTCUi72hl1U1MG7eaY07ctFiEbL-e9D17yUdq3OT3iRoE04F0_1h-JcUBP513gnObI0JxD0LQk4bagAv3b22ot3jbXLoLoQgBPbBf4eKS97oyGntWM_3GMa66m1ElrAeP5R42V25WPqXZmmEwAouivQp31kDLxqFLIA:feedlydev"
    }

    params_streams = {
        #       "count"     : "100",
        "count": "1000",
        "ranked": "newest",
        "unreadOnly": "false",
        "newerThan": newerthan
    }
    #url = "http://cloud.feedly.com/v3/profile"
    #r = requests.get(url, headers=headers)
    url = "http://cloud.feedly.com/v3/subscriptions"
    r = requests.get(url, headers=headers)
    if r.status_code != 200:
        return False
    feeds = r.json()
    for feed in feeds:
        feed_id = feed['id']
        feed_title = feed['title'].encode("ascii", 'replace')
        # the category label can contain the subset and category name
        category_label = feed['categories'][0]['label']
        label_split = category_label.split('-')
        if len(label_split) > 1:
            feed_subset = label_split[0].strip()
            feed_category = label_split[1].strip()
        else:
            feed_subset = 'SI'
            feed_category = label_split[0].strip()
        print("crawl_feedly: scraping feed category/title", feed_category,
              feed_title)
        if rss_field == '' or category_label == rss_field:
            url = "http://cloud.feedly.com/v3/streams/contents"
            params_streams['streamId'] = feed_id
            r = requests.get(url, headers=headers, params=params_streams)
            stream = r.json()
            if 'items' in stream:
                bulk_data = []
                for entry in stream['items']:
                    feedlymap = models.FeedlyMap()
                    feedlymap.post_id = entry['id']
                    try:
                        feedlymap.published_date = datetime.fromtimestamp(
                            entry['published'] / 1000)
                    except Exception:
                        last_year = datetime.now().year - 1
                        feedlymap.published_date = datetime(
                            last_year, 1, 1, 00, 00, 00)
                    feedlymap.subset = feed_subset
                    feedlymap.category = feed_category
                    feedlymap.feed = feed_title
                    if 'topics' in feed:
                        feedlymap.feed_topics = feed['topics']
                    if 'keywords' in entry:
                        feedlymap.body_topics = entry['keywords']
                    if 'title' in entry:
                        feedlymap.title = entry['title']
                    if 'canonicalUrl' in entry:
                        feedlymap.url = entry['canonicalUrl']
                    else:
                        if 'originId' in entry:
                            n = entry['originId'].find('http')
                            feedlymap.url = entry['originId'][n:]
                        elif 'origin' in entry:
                            origin = entry['origin']
                            feedlymap.url = origin['htmlUrl']
                    feedlymap.post_id = feedlymap.url
                    if 'summary' in entry:
                        bs = BeautifulSoup(entry['summary']['content'],
                                           "lxml")  # in case of RSS feed
                    if 'content' in entry:
                        bs = BeautifulSoup(
                            entry['content']['content'],
                            "lxml")  # in case of Google News feed
                    feedlymap.body = bs.get_text().encode("ascii", 'replace')
                    data = elastic.convert_for_bulk(feedlymap, 'update')
                    bulk_data.append(data)
                bulk(models.client, actions=bulk_data, stats_only=True)
    return True
Example #15
def crawl_feedly(from_dt, rss_field):
    global headers

    t = time(0, 0)
    dt = datetime.combine(from_dt, t)
    s = dt.timestamp()
    #datetime.datetime.fromtimestamp(s).strftime('%c')
    ms = s * 1000
    newerthan = "{:.0f}".format(ms)
    headers = {
        #[email protected] (expires on 2018-aug-26)
        "Authorization":
        "A3iuGsp9UjnsSiLwl5ZoPrLZj3mO4d16muxgezgpLesPhJ4YoKgC0XdiW_ucnm7b1Z-o5DKK6oLqoW9SRNUkoTcQ8npBBmqbOF03zF3tFWaNI0Lir_hrAahmVuypG5BXVZidJJ4PuaXr4zg5pYRE32OxO0N05X_A2sdZC93oWwQU1GVLJ9evh3qmu0WXYPVXpxffytgnFjUg2JB1zGK3KJkbDl-6ioJudiD2IZczA0R52tPwFZZ0FimkE3zV:feedlydev"
    }

    params_streams = {
        #       "count"     : "100",
        "count": "1000",
        "ranked": "newest",
        "unreadOnly": "false",
        "newerThan": newerthan
    }
    #url = "http://cloud.feedly.com/v3/profile"
    #r = requests.get(url, headers=headers)
    url = "http://cloud.feedly.com/v3/subscriptions"
    r = requests.get(url, headers=headers)
    if r.status_code != 200:
        return False
    feeds = r.json()
    for feed in feeds:
        feed_id = feed['id']
        feed_title = feed['title'].encode("ascii", 'replace')
        # the category label can contain the subset and category name
        category_label = feed['categories'][0]['label']
        label_split = category_label.split('-')
        if len(label_split) > 1:
            feed_subset = label_split[0].strip()
            feed_category = label_split[1].strip()
        else:
            feed_subset = 'SI'
            feed_category = label_split[0].strip()
        print("crawl_feedly: scraping feed category/title", feed_category,
              feed_title)
        if rss_field == '' or category_label == rss_field:
            url = "http://cloud.feedly.com/v3/streams/contents"
            params_streams['streamId'] = feed_id
            r = requests.get(url, headers=headers, params=params_streams)
            stream = r.json()
            if 'items' in stream:
                bulk_data = []
                for entry in stream['items']:
                    feedlymap = models.FeedlyMap()
                    feedlymap.post_id = entry['id']
                    feedlymap.url = ""
                    try:
                        feedlymap.published_date = datetime.fromtimestamp(
                            entry['published'] / 1000)
                    except Exception:
                        last_year = datetime.now().year - 1
                        feedlymap.published_date = datetime(
                            last_year, 1, 1, 00, 00, 00)
                    feedlymap.subset = feed_subset
                    feedlymap.category = feed_category
                    feedlymap.feed = feed_title
                    if 'topics' in feed:
                        feedlymap.feed_topics = feed['topics']
                    if 'keywords' in entry:
                        feedlymap.body_topics = entry['keywords']
                    if 'title' in entry:
                        feedlymap.title = entry['title']
                    if 'canonicalUrl' in entry:
                        feedlymap.url = entry['canonicalUrl']
                    if len(feedlymap.url) == 0:
                        if 'originId' in entry:
                            n = entry['originId'].find('http')
                            if n > 0:
                                feedlymap.url = entry['originId'][n:]
                    if len(feedlymap.url) == 0:
                        if 'origin' in entry:
                            origin = entry['origin']
                            feedlymap.url = origin['htmlUrl']
                    feedlymap.post_id = feedlymap.url
                    if 'summary' in entry:
                        bs = BeautifulSoup(entry['summary']['content'],
                                           "lxml")  # in case of RSS feed
                    if 'content' in entry:
                        bs = BeautifulSoup(
                            entry['content']['content'],
                            "lxml")  # in case of Google News feed
                    feedlymap.body = bs.get_text().encode("ascii", 'replace')
                    data = elastic.convert_for_bulk(feedlymap, 'update')
                    bulk_data.append(data)
                bulk(models.client, actions=bulk_data, stats_only=True)
    return True
Example #16
def crawl_feedly(from_date, rss_field):
    global headers

    t = time(0, 0)
    dt = datetime.combine(from_date, t)
    s = dt.timestamp()
    #datetime.datetime.fromtimestamp(s).strftime('%c')
    ms = s * 1000
    newerthan = "{:.0f}".format(ms)
    headers = {
        #[email protected] (expires on 2017-dec-07)
        "Authorization":
        "AzNr8sCFyRuIX3upnzA-VnUUebvthkUEF0R9bccg352muEznNt9hK9m4kj8ljkQvFfoVGDYHZcLBFKuFgXRVy4HN1sVV2WYowIsQZ7lTGxB9WYNqxRGimPyZUAijHL7ugMo9hxRgYij_rOonwruuus3O2BQe7U_sNGy_SKL6nmEVDh-DsQL5EOVM34C3-0tcATwEMoaQxUUQ78bAJ6i3HrnLy8NPUg:feedlydev"
    }

    params_streams = {
        #       "count"     : "100",
        "count": "1000",
        "ranked": "newest",
        "unreadOnly": "false",
        "newerThan": newerthan
    }
    #url = "http://cloud.feedly.com/v3/profile"
    #r = requests.get(url, headers=headers)
    url = "http://cloud.feedly.com/v3/subscriptions"
    r = requests.get(url, headers=headers)
    if r.status_code != 200:
        return False
    feeds = r.json()
    for feed in feeds:
        feed_id = feed['id']
        feed_title = feed['title'].encode("ascii", 'replace')
        # the category label can contain the subset and category name
        category_label = feed['categories'][0]['label']
        label_split = category_label.split('-')
        if len(label_split) > 1:
            feed_subset = label_split[0].strip()
            feed_category = label_split[1].strip()
        else:
            feed_subset = 'SI'
            feed_category = label_split[0].strip()
        print("crawl_feedly: scraping feed category/title", feed_category,
              feed_title)
        if rss_field == '' or category_label == rss_field:
            url = "http://cloud.feedly.com/v3/streams/contents"
            params_streams['streamId'] = feed_id
            r = requests.get(url, headers=headers, params=params_streams)
            stream = r.json()
            if 'items' in stream:
                bulk_data = []
                for entry in stream['items']:
                    feedlymap = models.FeedlyMap()
                    feedlymap.post_id = entry['id']
                    try:
                        feedlymap.published_date = datetime.fromtimestamp(
                            entry['published'] / 1000)
                    except Exception:
                        last_year = datetime.now().year - 1
                        feedlymap.published_date = datetime(
                            last_year, 1, 1, 00, 00, 00)
                    feedlymap.subset = feed_subset
                    feedlymap.category = feed_category
                    feedlymap.feed = feed_title
                    if 'topics' in feed:
                        feedlymap.feed_topics = feed['topics']
                    if 'keywords' in entry:
                        feedlymap.body_topics = entry['keywords']
                    if 'title' in entry:
                        feedlymap.title = entry['title']
                    if 'canonicalUrl' in entry:
                        feedlymap.url = entry['canonicalUrl']
                    else:
                        if 'originId' in entry:
                            n = entry['originId'].find('http')
                            feedlymap.url = entry['originId'][n:]
                        elif 'origin' in entry:
                            origin = entry['origin']
                            feedlymap.url = origin['htmlUrl']
                    feedlymap.post_id = feedlymap.url
                    if 'summary' in entry:
                        bs = BeautifulSoup(entry['summary']['content'],
                                           "lxml")  # in case of RSS feed
                    if 'content' in entry:
                        bs = BeautifulSoup(
                            entry['content']['content'],
                            "lxml")  # in case of Google News feed
                    feedlymap.body = bs.get_text().encode("ascii", 'replace')
                    data = elastic.convert_for_bulk(feedlymap, 'update')
                    bulk_data.append(data)
                bulk(models.client, actions=bulk_data, stats_only=True)
    return True
Example #17
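# Fetches the INBOX over IMAP, extracts addresses, subject, links and body
# text from each message, and indexes one MailMap document per mail.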
def load_mail(email_choices, email_address, email_password):
    #server = IMAPClient('imap.kpnmail.nl', use_uid=True)
    server = IMAPClient('imap.deheerlijkekeuken.nl', use_uid=True, ssl=False)
    resp = server.login(email_address, email_password)
    resp = resp.decode()
    if resp != "LOGIN Ok.":
        return False
    select_info = server.select_folder('INBOX')
    print('%d messages in INBOX' % select_info[b'EXISTS'])
    messages = server.search(['ALL'])
    response = server.fetch(messages, ['ENVELOPE', 'RFC822', 'BODY[TEXT]'])
    server.logout()

    text_maker = html2text.HTML2Text()
    text_maker.ignore_links = True
    text_maker.bypass_tables = False
    bulk_data = []
    count = 0
    total_count = 0
    for msgid, raw_email in response.items():
        envelope = raw_email[b'ENVELOPE']
        post_id = envelope.message_id.decode()
        subject = envelope.subject.decode()
        from_addr = envelope.from_[0].mailbox.decode(
        ) + '@' + envelope.from_[0].host.decode()
        to_addr = envelope.to[0].mailbox.decode(
        ) + '@' + envelope.to[0].host.decode()
        print('ID #%d: "%s" received %s' % (msgid, subject, envelope.date))
        raw_email_string = raw_email[b'RFC822'].decode('utf-8')
        email_message = email.message_from_string(raw_email_string)
        body_text = ""
        # this will loop through all the available multiparts in mail
        for part in email_message.walk():
            if part.get_content_type() == "text/plain":  # ignore attachments
                body = part.get_payload(decode=True)
                body_text = body.decode('utf-8')
                links = set()
            if part.get_content_type() == "text/html":  # ignore attachments
                body = part.get_payload(decode=True)
                body = body.decode('utf-8').strip()
                bs = BeautifulSoup(body, "lxml")
                body_tag = bs.find('body')
                body_text = body_tag.text
                links = mail.get_href_links(subject, bs)
                link_bodies = mail.get_href_link_bodies(links)
                #body = text_maker.handle(body)
                break
        from_index = body_text.find("From:")
        if from_index > 0:
            nl_index = body_text.find("\n", from_index)
            txt = body_text[from_index + 6:nl_index].replace('\r', '')
            from_addr = txt
        sent_index = body_text.find("Sent:")
        if sent_index > 0:
            nl_index = body_text.find("\n", sent_index)
            txt = body_text[sent_index + 5:nl_index].strip().split(' ')
            txt = ' '.join(txt[1:4])
            #conversion fails because of month in local language
            #published_date = datetime.strptime(txt, "%d %B %Y").date()
        to_index = body_text.find("To:")
        if to_index > 0:
            nl_index = body_text.find("\n", to_index)
            txt = body_text[to_index + 3:nl_index]
            to_addr = txt
        subject_index = body_text.find("Subject:")
        if subject_index > 0:
            nl_index = body_text.find("\n", subject_index)
            txt = body_text[subject_index + 9:nl_index]
            subject = txt
        # str.replace returns a new string; reassign to keep the changes
        body_text = body_text.replace("\r\n", " ")
        body_text = body_text.replace("\n", " ")
        body_text = body_text.replace("  ", " ")
        mail_doc = models.MailMap()
        mail_doc.post_id = msgid
        mail_doc.to_addr = to_addr
        mail_doc.from_addr = from_addr
        mail_doc.published_date = envelope.date.date()
        #mail_doc.links = [link[0] for link in links]
        mail_doc.links = link_bodies
        mail_doc.subject = subject
        mail_doc.url = ""
        mail_doc.body = body_text
        data = elastic.convert_for_bulk(mail_doc, 'update')
        bulk_data.append(data)
        count = count + 1
        if count > 100:
            bulk(models.client, actions=bulk_data, stats_only=True)
            total_count = total_count + count
            print(
                "load_mail: written another batch, total written {0:d}".format(
                    total_count))
            bulk_data = []
            count = 1

    bulk(models.client, actions=bulk_data, stats_only=True)

    return True
Example #18
def crawl_feedly(from_date, rss_field):
    global headers

    t = time(0, 0)
    dt = datetime.combine(from_date, t)
    s = dt.timestamp()
    #datetime.datetime.fromtimestamp(s).strftime('%c')
    ms = s * 1000
    newerthan = "{:.0f}".format(ms)
    headers = {
        #[email protected] (expires on 2017-10-04)
        "Authorization" : "A2nU8r1LuQ_wUuYHftraCIc0imow9HY7GYB1qxm-OeaU--I-cVt69lCZfEkvsOSX8R9qI6C6ABH5Nq1XKFnKX6JlkY_myGM_hfksTQe4wmWlqRxj-LBQ7n9UhIL1oXfAf80jAVhiz6w8tB9ToYV_YwB47sHASzTMlybx-5bXgmu9gtR-N-FUKByfgihrIjpShy6hMwHYYnKhz73DfQ3JhMCAdAqL1RA:feedlydev"
        }

    params_streams = {
#       "count"     : "100",
        "count"     : "1000",
        "ranked"    : "newest",
        "unreadOnly": "false",
        "newerThan" : newerthan
        }
    #url = "http://cloud.feedly.com/v3/profile"
    #r = requests.get(url, headers=headers)
    url = "http://cloud.feedly.com/v3/subscriptions"
    r = requests.get(url, headers=headers)
    if r.status_code != 200:
        return False
    feeds = r.json()
    for feed in feeds:
        feed_id = feed['id']
        feed_title = feed['title'].encode("ascii", 'replace')
        # the category label can contain the subset and category name
        category_label = feed['categories'][0]['label']
        label_split = category_label.split('-')
        if len(label_split) > 1:
            feed_subset = label_split[0].strip()
            feed_category = label_split[1].strip()
        else:
            feed_subset = 'SI'
            feed_category = label_split[0].strip()
        print("crawl_feedly: scraping feed category/title", feed_category, feed_title)
        if rss_field == '' or category_label == rss_field:
            url = "http://cloud.feedly.com/v3/streams/contents"
            params_streams['streamId'] = feed_id
            r = requests.get(url, headers=headers, params=params_streams)
            stream = r.json()
            if 'items' in stream:
                bulk_data = []
                for entry in stream['items']:
                    feedlymap = models.FeedlyMap()
                    feedlymap.post_id = entry['id']
                    try:
                        feedlymap.published_date = datetime.fromtimestamp(entry['published']/1000)
                    except Exception:
                        last_year = datetime.now().year - 1
                        feedlymap.published_date = datetime(last_year, 1, 1, 00, 00, 00)
                    feedlymap.subset = feed_subset
                    feedlymap.category = feed_category
                    feedlymap.feed = feed_title
                    if 'topics' in feed:
                        feedlymap.feed_topics = feed['topics']
                    if 'keywords' in entry:
                        feedlymap.body_topics = entry['keywords']
                    feedlymap.title = entry['title']
                    if 'canonicalUrl' in entry:
                        feedlymap.url = entry['canonicalUrl']
                    else:
                        if 'originId' in entry:
                            n = entry['originId'].find('http')
                            feedlymap.url = entry['originId'][n:]
                        elif 'origin' in entry:
                            origin = entry['origin']
                            feedlymap.url = origin['htmlUrl']
                    feedlymap.post_id = feedlymap.url
                    if 'summary' in entry:
                        bs = BeautifulSoup(entry['summary']['content'],  "lxml") # in case of RSS feed
                    if 'content' in entry:
                        bs = BeautifulSoup(entry['content']['content'], "lxml") # in case of Google News feed
                    feedlymap.body = bs.get_text().encode("ascii", 'replace')
                    data = elastic.convert_for_bulk(feedlymap, 'update')
                    bulk_data.append(data)
                bulk(models.client, actions=bulk_data, stats_only=True)
    return True