Example #1
def render_document(vnode, expressions, context):
    for expression in expressions:
        evaluation = evaluate_expression(expression, context)
        node = expression.get('node')
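        # skip re-rendering when the cached string value has not changed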
        if isinstance(expression.get('value'),
                      basestring) and expression.get('value') == evaluation:
            continue
        expression['value'] = evaluation

        if expression.get('type') == 'each':
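            # rebuild the loop: detach the previously rendered children, then insert one freshly rendered child per item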
            if expression.get('parent'):
                parent = expression.get('parent')
            else:
                parent = node.parent()
                expression['parent'] = parent
            riot_id = node.attr['data-riot-id']
            original_children = parent.children('[data-riot-id="%s"]' %
                                                riot_id)
            # 0. add placeholder
            placeholder = PyQuery('<text></text>')
            placeholder.insertBefore(original_children.eq(0))
            # 1. remove children
            original_node = original_children.clone()
            original_children.remove()
            expression['node'] = original_node
            # 2. insert children
            loopcontext = {}
            loopcontext.update(
                context if isinstance(context, dict) else vars(context))
            expressions_col = []
            for loop_index, item in enumerate(evaluation):
                loopcontext.update(
                    item if isinstance(item, dict) else vars(item))
                loopcontext['loopindex'] = loop_index
                child_node = PyQuery(expression.get('impl'))
                child_node.attr['data-riot-loopindex'] = str(loop_index)
                expressions = parse_document_expressions(child_node)
                expressions_col.append((expressions, loopcontext))
                render_document(vnode, expressions, loopcontext)
                child_node.insertBefore(placeholder)
            # 3. remove placeholder
            if len(evaluation) == 0:
                placeholder.attr['data-riot-id'] = str(riot_id)
            else:
                placeholder.remove()
            mark_dirty(parent)
            generate_widget(parent)
            for expressions, loopcontext in expressions_col:
                connect_signals(vnode, expressions, loopcontext)
            continue
        if expression.get('type') == 'markup':
            node.attr['markup'] = json.dumps(evaluation)
            node.html('')
            mark_dirty(node)
            continue
        if expression.get('type') == 'attribute':
            attribute = expression.get('attribute')
            node.attr[attribute] = str(evaluation)
            mark_dirty(node)
            continue
Example #2
def tweetPaser(tweets_html):
    tweetslist = []
    if tweets_html.strip() != '':
        scraped_tweets = PyQuery(tweets_html)
        scraped_tweets.remove('div.withheld-tweet')
        tweets = scraped_tweets('div.js-stream-tweet')
        if len(tweets) != 0:
            for tweet_html in tweets:
                t = {}
                tweetPQ = PyQuery(tweet_html)
                t['user'] = tweetPQ("span:first.username.u-dir b").text()
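                # collapse whitespace and undo the space Twitter inserts after '#' and '@'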
                txt = re.sub(r"\s+", " ", tweetPQ("p.js-tweet-text").text())
                txt = txt.replace('# ', '#')
                txt = txt.replace('@ ', '@')
                t['tweet'] = txt
                t['id'] = tweetPQ.attr("data-tweet-id")
                t['retweets'] = int(
                    tweetPQ(
                        "span.ProfileTweet-action--retweet span.ProfileTweet-actionCount"
                    ).attr("data-tweet-stat-count").replace(",", ""))
                t['favorites'] = int(
                    tweetPQ(
                        "span.ProfileTweet-action--favorite span.ProfileTweet-actionCount"
                    ).attr("data-tweet-stat-count").replace(",", ""))
                t['link'] = 'https://twitter.com' + tweetPQ.attr(
                    "data-permalink-path")
                t['mentions'] = re.compile('(@\\w*)').findall(t['tweet'])
                t['hashtags'] = re.compile('(#\\w*)').findall(t['tweet'])
                t['timestamp'] = int(
                    tweetPQ("small.time span.js-short-timestamp").attr(
                        "data-time"))
                tweetslist.append(t)
    return tweetslist
Example #3
def getTweets(users, word, lastpost):
    try:
        query = ''
        if word.strip() != '':
            query += word
        if len(users) == 1:
            query += ' from:' + users[0]
        elif len(users) > 1:
            query += ' from:' + ' OR from:'.join(users)
        query = urllib.parse.quote_plus(query)
        url = 'https://twitter.com/i/search/timeline?f=tweets&q={query}&src=typd'.format(
            query=query)
        headers = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:50.0) Gecko/20100101 Firefox/50.0',
            'Accept': "application/json, text/javascript, */*; q=0.01",
            'Accept-Language': "de,en-US;q=0.7,en;q=0.3",
            'X-Requested-With': "XMLHttpRequest",
            'Referer': url,
            'Connection': "keep-alive"
        }
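        # call Twitter's legacy search timeline endpoint the way the web client's XHR does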
        response = requests.get(url, headers=headers)
        statuscode = response.status_code
        tweetslist = []
        new_tweets = []
        json_response = response.json()
        if statuscode == 200:
            if json_response['items_html'].strip() != '':
                scraped_tweets = PyQuery(json_response['items_html'])
                scraped_tweets.remove('div.withheld-tweet')
                tweets = scraped_tweets('div.js-stream-tweet')
                if len(tweets) != 0:
                    for tweet_html in tweets:
                        t = {}
                        tweetPQ = PyQuery(tweet_html)
                        t['user'] = tweetPQ(
                            "span:first.username.u-dir b").text()
                        txt = re.sub(r"\s+", " ",
                                     tweetPQ("p.js-tweet-text").text())
                        txt = txt.replace('# ', '#')
                        txt = txt.replace('@ ', '@')
                        t['tweet'] = txt
                        t['id'] = tweetPQ.attr("data-tweet-id")
                        t['link'] = tweetPQ.attr("data-permalink-path")
                        t['timestamp'] = int(
                            tweetPQ("small.time span.js-short-timestamp").attr(
                                "data-time"))
                        tweetslist.append(t)
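            # keep only tweets newer than the last post already processed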
            for tw in tweetslist:
                if tw['id'] == lastpost['id']:
                    break
                if 'timestamp' in tw.keys() and 'timestamp' in lastpost.keys():
                    if tw['timestamp'] < lastpost['timestamp']:
                        break
                new_tweets.append(tw)
        return new_tweets, statuscode
    except:
        return [], -1
Example #4
    def getTweets(userName, csv, id, proxy=None):
        '''
        Get tweet information from twitter.com
        :param userName: the name of twitter account
        :param csv: the file to written in
        :param proxy: proxy of web
        :return: void
        '''
        e_cursor = ''
        e_cursor_previous = 'none'

        extractedFormattedTweetInfo = []
        cookieJar = http.cookiejar.CookieJar()

        while e_cursor != e_cursor_previous:
            # Fetch the next page like a browser would and read the response back as JSON
            jsonTweet = HtmlHandler.getJsonReponse(userName, e_cursor, cookieJar, proxy, id)
            if len(jsonTweet['items_html'].strip()) == 0:
                break
            # Control the cursor on the html
            e_cursor_previous = e_cursor
            e_cursor = jsonTweet['min_position']
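            # parse the returned HTML fragment and drop tweets withheld by Twitter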
            tweets = PyQuery(jsonTweet['items_html'])

            tweets.remove('div.withheld-tweet')
            tweets = tweets('div.js-stream-tweet')

            if len(tweets) == 0:
                break

            for tweetPiece in tweets:
                tweetPQ = PyQuery(tweetPiece)
                tweet = Tweet()

                # Filter corresponding information from the html
                tweet.username = id
                tweet.date = datetime.datetime.fromtimestamp(int(tweetPQ("small.time span.js-short-timestamp").attr("data-time")))
                tweet.tweetid = tweetPQ.attr("data-tweet-id")
                tweet.authorid = int(tweetPQ("a.js-user-profile-link").attr("data-user-id"))
                tweet.text = re.sub(r"\s+", " ", tweetPQ("p.js-tweet-text").text().replace('# ', '#').replace('@ ', '@'))
                tweet.retweets = int(tweetPQ("span.ProfileTweet-action--retweet span.ProfileTweet-actionCount").attr("data-tweet-stat-count").replace(",", ""))
                tweet.favorites = int(tweetPQ("span.ProfileTweet-action--favorite span.ProfileTweet-actionCount").attr("data-tweet-stat-count").replace(",", ""))
                tweet.mentions = " ".join(re.compile('(@\\w*)').findall(tweet.text))
                tweet.hashtags = " ".join(re.compile('(#\\w*)').findall(tweet.text))
                tweet.permalink = 'https://twitter.com' + tweetPQ.attr("data-permalink-path")
                if len(tweetPQ('span.Tweet-geo')) > 0:
                    tweet.geo = tweetPQ('span.Tweet-geo').attr('title')
                else:
                    tweet.geo = ''

                extractedFormattedTweetInfo.append(tweet)
                print(" Progress: ", end='')
                print(len(extractedFormattedTweetInfo), end='')
                print(" tweets extracted from html.", end='\r')

        # Write what was extracted from the html page into the csv file
        HtmlHandler.writeToCSV(extractedFormattedTweetInfo, csv)
Example #5
def render_document(vnode, expressions, context):
    for expression in expressions:
        evaluation = evaluate_expression(expression, context)
        node = expression.get('node')
        if isinstance(expression.get('value'), basestring) and expression.get('value') == evaluation:
            continue
        expression['value'] = evaluation

        if expression.get('type') == 'each':
            if expression.get('parent'):
                parent = expression.get('parent')
            else:
                parent = node.parent()
                expression['parent'] = parent
            riot_id = node.attr['data-riot-id']
            original_children = parent.children('[data-riot-id="%s"]' % riot_id)
            # 0. add placeholder
            placeholder = PyQuery('<text></text>')
            placeholder.insertBefore(original_children.eq(0))
            # 1. remove children
            original_node = original_children.clone()
            original_children.remove()
            expression['node'] = original_node
            # 2. insert children
            loopcontext = {}
            loopcontext.update(context if isinstance(context, dict) else vars(context))
            expressions_col = []
            for loop_index, item in enumerate(evaluation):
                loopcontext.update(item if isinstance(item, dict) else vars(item))
                loopcontext['loopindex'] = loop_index
                child_node = PyQuery(expression.get('impl'))
                child_node.attr['data-riot-loopindex'] = str(loop_index)
                expressions = parse_document_expressions(child_node)
                expressions_col.append((expressions, loopcontext))
                render_document(vnode, expressions, loopcontext)
                child_node.insertBefore(placeholder)
            # 3. remove placeholder
            if len(evaluation) == 0:
                placeholder.attr['data-riot-id'] = str(riot_id)
            else:
                placeholder.remove()
            mark_dirty(parent)
            generate_widget(parent)
            for expressions, loopcontext in expressions_col:
                connect_signals(vnode, expressions, loopcontext)
            continue
        if expression.get('type') == 'markup':
            node.attr['markup'] = json.dumps(evaluation)
            node.html('')
            mark_dirty(node)
            continue
        if expression.get('type') == 'attribute':
            attribute = expression.get('attribute')
            node.attr[attribute] = str(evaluation)
            mark_dirty(node)
            continue
Example #6
def parse_json(search_params):
    """
    Parse the json tweet
    :param search_params: SearchParams object
    :return: void
    """
    min_position = get_last_search_position(search_params.log_file_name)
    count = 0
    while True:
        json_res = get_tweets(search_params, min_position)
        if len(json_res['items_html'].strip()) == 0:
            break

        min_position = json_res['min_position']
        search_params.logging.info('min_pos - {}'.format(min_position))
        item = json_res['items_html']
        scraped_tweets = PyQuery(item)
        scraped_tweets.remove('div.withheld-tweet')
        tweets = scraped_tweets('div.js-stream-tweet')

        for tweet_html in tweets:
            print(count)
            tweet_py_query = PyQuery(tweet_html)
            name = tweet_py_query.attr("data-name")
            screen_name = tweet_py_query.attr("data-screen-name")
            tweet_id = tweet_py_query.attr("data-tweet-id")
            tweet_text = re.sub(
                r"\s+", " ",
                tweet_py_query("p.js-tweet-text").text().replace(
                    '# ', '#').replace('@ ', '@'))
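            # the data-time attribute carries the tweet's Unix timestamp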
            tweet_date_time = int(
                tweet_py_query("small.time span.js-short-timestamp").attr(
                    "data-time"))
            tweet_date_time = datetime.datetime.fromtimestamp(tweet_date_time)
            retweet_count = int(
                tweet_py_query(
                    "span.ProfileTweet-action--retweet span.ProfileTweet-actionCount"
                ).attr("data-tweet-stat-count").replace(",", ""))
            favorites_count = int(
                tweet_py_query(
                    "span.ProfileTweet-action--favorite span.ProfileTweet-actionCount"
                ).attr("data-tweet-stat-count").replace(",", ""))
            permalink = 'https://twitter.com' + tweet_py_query.attr(
                "data-permalink-path")

            tweet = Tweet(str(uuid.uuid4()), name, screen_name, tweet_id,
                          tweet_text, tweet_date_time, retweet_count,
                          favorites_count, permalink)
            # Now Write to OP or save to DB
            write_op(search_params.op, tweet)
            count += 1
        # sleep(5)
        if 0 < search_params.max_retrieval_count <= count:
            break
Example #7
 def _compute_signature(self):
     with file_open("partner_communication_switzerland/static/html/signature.html")\
             as tfile:
         template = PyQuery(tfile.read())
         phone = {
             "fr_CH": "+41 (0)24 434 21 24",
             "de_DE": "+41 (0)31 552 21 21",
             "it_IT": "+41 (0)31 552 21 24",
             "en_US": "+41 (0)31 552 21 25"
         }
         phone_link = {
             "fr_CH": "+41244342124",
             "de_DE": "+41315522121",
             "it_IT": "+41315522124",
             "en_US": "+41315522125"
         }
         facebook = {
             "fr_CH": "https://www.facebook.com/compassionsuisse/",
             "de_DE": "https://www.facebook.com/compassionschweiz/",
             "it_IT": "https://www.facebook.com/compassionsvizzera/",
             "en_US": "https://www.facebook.com/compassionsuisse/"
         }
         for user in self:
             values = {
                 "user": user,
                 "name": (f"{user.preferred_name} {user.lastname}"
                          if user.firstname else _("The team of Compassion")),
                 "email": user.email if user.firstname else "*****@*****.**",
                 "lang": self.env.lang,
                 "lang_short": self.env.lang[:2],
                 "team": _("and the team of Compassion") if user.firstname else "",
                 "office_hours": _("mo-thu: 8am-4pm<br/>fri 8am-12am"),
                 "company_name": user.company_id.address_name,
                 "phone_link": phone_link.get(self.env.lang),
                 "phone": phone.get(self.env.lang),
                 "facebook": facebook.get(self.env.lang),
             }
             if self.env.lang in ("fr_CH", "en_US"):
                 template.remove("#bern")
             else:
                 template.remove("#yverdon")
             user.signature = template.html().format(**values)
Example #8
def _get_tweet_batch_html(config, refresh_cursor, cookie_jar, proxy):
    """Scraper
    Identifies the tweets portion from the html json file
    """
    json = _get_json_response(config, refresh_cursor, cookie_jar, proxy)

    if len(json["items_html"].strip()) == 0:
        return
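    # min_position is the cursor Twitter returns for fetching the next batch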
    refresh_cursor = json["min_position"]
    scraped_tweets = PyQuery(json["items_html"])
    # Remove incomplete tweets withheld by Twitter Guidelines
    scraped_tweets.remove("div.withheld-tweet")
    return scraped_tweets("div.js-stream-tweet")
Example #9
    def __processImageTag(self, i, e):
        obj = PyQuery(e)
        style = obj.attr('style')

        if style is not None and 'display: none' in style:
            obj.remove()
            return

        newObj = PyQuery("<img />")
        newObj.attr('src', obj.attr('rel:bf_image_src'))
        newObj.attr('style', obj.attr('style'))
        newObj.width(obj.width())
        newObj.height(obj.height())
        obj.replaceWith(newObj)
Example #10
def pq_remove_nodes(
    pq: PyQuery,
    css_remove: Union[str, list],
) -> PyQuery:

    pq = pq.clone()

    if isinstance(css_remove, str):
        css_remove = [css_remove]

    for remove_node in css_remove:
        pq.remove(remove_node)

    return pq
Example #12
def get_tweets_for_input(tweet_criteria, query_metadata, should_query_for_city, receive_buffer=None):
    print("Fetching results for :: Query : {0} Language : {1} City : {2} Range : {3} From: {4} Till: {5}".format(
        tweet_criteria['query'], tweet_criteria['language'], tweet_criteria['near'], tweet_criteria['within'],
        tweet_criteria['since'], tweet_criteria['until']))

    buffer_length = 100
    refresh_cursor = ''
    results = []
    results_aux = []

    active = True

    while active:
        try:
            tweet_response_json = Tc.getJsonReponse(tweet_criteria, should_query_for_city, refresh_cursor)
        except Exception as e:
            tweet_response_json = None
            break
        if len(tweet_response_json['items_html'].strip()) == 0:
            break

        refresh_cursor = tweet_response_json['min_position']
        scraped_tweets = PyQuery(tweet_response_json['items_html'])
        # Remove incomplete tweets withheld by Twitter Guidelines
        scraped_tweets.remove('div.withheld-tweet')
        tweets = scraped_tweets('div.js-stream-tweet')

        if len(tweets) == 0:
            break
        temp = Tf.parse_tweet_list_from_tweets_html(tweets)
        results.extend(temp)
        results_aux.extend(temp)
        print("Fetched {0} results from Twitter API. {1} / {2}, has more items: {3}".format(len(temp), len(results),
                                                                                            tweet_criteria[
                                                                                                "max_tweets"],
                                                                                            tweet_response_json[
                                                                                                'has_more_items']))
        # if receive_buffer and len(results_aux) >= buffer_length:
        #     receive_buffer(results_aux, query_metadata, tweet_criteria["output_file_name"])
        #     results_aux = []

        # if len(results) >= tweet_criteria["max_tweets"] >= 0 or tweet_response_json['has_more_items'] is False:
        if len(results) >= tweet_criteria["max_tweets"]:
            active = False

    if receive_buffer and len(results_aux) > 0:
        receive_buffer(results_aux, query_metadata, tweet_criteria["output_file_name"])

    return results
Example #13
    def parse_content():
        exclude_classes = [
            '.article-metaline', '.article-metaline-right', '.push']
        exclude_text_spans = ['發信站: 批踢踢實業坊(ptt.cc)', '文章網址:']

        for exclude_text in exclude_text_spans:
            ele = main.lxml.xpath(
                f'//span[contains(text(),"{exclude_text}")]')[0]
            ele.getparent().remove(ele)
        cleaned_html = etree.tostring(main.lxml)

        cleaned_pq = PyQuery(cleaned_html)
        for exclude_cls in exclude_classes:
            cleaned_pq.remove(exclude_cls)

        return cleaned_pq.text()
Example #14
 def filter_html(self, html, is_body: bool = True):
     """
     Filter the HTML: strip comments, script and style tags.
     """
     try:
         text = re.sub(r"<!-[\s\S]*?-->", "", html)
         doc = PyQuery(text)
         doc.remove("script")
         doc.remove("style")
         if is_body:
             return list(doc("body"))[0]
         else:
             return list(doc("head"))[0]
     except Exception as e:
         self.logger.info(e)
         return None
Example #15
    def getTweets(criteria):
        refreshCursor = ''

        results = []
        resultsAux = []
        cookieJar = cookielib.CookieJar()

        active = True

        while active:
            json = TweetManager.getJsonReponse(criteria, refreshCursor, cookieJar)
            if len(json['items_html'].strip()) == 0:
                break

            refreshCursor = json['min_position']
            scrapedTweets = PyQuery(json['items_html'])
            #Remove incomplete tweets withheld by Twitter Guidelines
            scrapedTweets.remove('div.withheld-tweet')
            tweets = scrapedTweets('div.js-stream-tweet')

            if len(tweets) == 0:
                break

            for tweetHTML in tweets:
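                # account details and engagement counts live in data-* attributes and nested spans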
                tweetPQ = PyQuery(tweetHTML)

                accountId   = int(tweetPQ.attr("data-user-id"))
                accountFullname = tweetPQ.attr("data-name")
                accountHref = "/" + tweetPQ.attr("data-screen-name")
                account = TwitterAccount(accountId, accountFullname, accountHref)

                epoch   = int(tweetPQ("small.time span.js-short-timestamp").attr("data-time"))
                date    = time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime(epoch))
                likes   = int(tweetPQ("span.ProfileTweet-action--favorite span.ProfileTweet-actionCount").attr("data-tweet-stat-count").replace(",", ""))
                replies = int(tweetPQ("span.ProfileTweet-action--reply span.ProfileTweet-actionCount").attr("data-tweet-stat-count").replace(",", ""))
                retweets = int(tweetPQ("span.ProfileTweet-action--retweet span.ProfileTweet-actionCount").attr("data-tweet-stat-count").replace(",", ""))
                txt = re.sub(r"\s+", " ", tweetPQ("p.js-tweet-text").text().replace('# ', '#').replace('@ ', '@'))

                tweet = Tweet(account, date, likes, replies, retweets, txt)

                results.append(tweet)

                if criteria['limit'] > 0 and len(results) >= criteria['limit']:
                    active = False
                    break

        return results
Example #16
def sanitize_description(value):
    cleaned = PyQuery(value)
    cleaned = cleaned.remove('span.playMetaText')
    cleaned.remove('span.playMetaText')
    cleaned.remove('span.playCount')
    cleaned.remove('time')
    cleaned.remove('strong')

    desc = cleaned.html()

    if desc is None:
        return ""

    return desc.split('<span>')[-1:][0].replace('</span>', '').strip()
Example #17
def get_tweet_data(coin_name, max_position):
    cookieJar = http.cookiejar.CookieJar()
    active = True
    proxy = None
    receiveBuffer = None

    refresh_cursor = max_position

    while active:
        json = getJsonReponse(refresh_cursor, cookieJar, proxy)
        if len(json['items_html'].strip()) == 0:
            break

        # cursor id for the next page; each page holds 20 tweets
        refresh_cursor = json['min_position']
        print(refresh_cursor)
        scrapedTweets = PyQuery(json['items_html'])
        # Remove incomplete tweets withheld by Twitter Guidelines
        scrapedTweets.remove('div.withheld-tweet')
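        # drill down to each tweet's action-count list (replies, retweets, favorites)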
        tweets = scrapedTweets('div.js-stream-tweet')('div.content')('div.stream-item-footer')(
            'div.ProfileTweet-actionCountList')
        # print("tweets", tweets)

        if len(tweets) == 0:
            break
        for tweetHTML in tweets:
            print(tweetHTML)

            tweetPQ = PyQuery(tweetHTML)

            tweet_id = tweetPQ(
                "span.ProfileTweet-action--reply span.ProfileTweet-actionCount span.ProfileTweet-actionCountForAria").attr(
                "id").split("-")[6]
            print("tweet_id: ", tweet_id)

            reply_num = tweetPQ("span.ProfileTweet-action--reply span.ProfileTweet-actionCount").attr(
                "data-tweet-stat-count")
            print("reply_num: ", reply_num)

            retweet_num = tweetPQ("span.ProfileTweet-action--retweet span.ProfileTweet-actionCount").attr(
                "data-tweet-stat-count")
            print("retweet_num: ", retweet_num)

            favorite_num = tweetPQ("span.ProfileTweet-action--favorite span.ProfileTweet-actionCount").attr(
                "data-tweet-stat-count")
            print("favorite_num: ", favorite_num)
Example #18
 def parseNextPageUrl(self, category_page_content):
     doc = PyQuery(category_page_content)
     nodeAList = doc("span#view_47 > a")
     for nodeA in nodeAList:
         nodeAQ = PyQuery(nodeA)
         if nodeAQ.remove('span').text().strip().lower() == 'next':
             return nodeAQ.attr('href').strip()
     return None
Example #20
    def extract(self):
        self.html = re.sub('<!--.*?-->', '', self.html)
        doc = PyQuery(self.html)
        content_node = doc('div.kb_zw')
        if not content_node:
#            content_node = doc('div.zw_text')
            content_node = PyQuery(self.hxs.select("//div[@class = 'zw_text']").extract()[0])
        
        content_node.remove('script')
        content_node.remove('style')
        content_node.remove('iframe')
        content_node.remove('div[style = "float:left; width:303px; height:250px; display:inline; margin:10px 10px 10px 10px;"]')
        content_node.remove('input')
        
        

        item = ContentItem()
        item['title'] = self.title = doc('td[align = "center"]')('b').text()
        if item['title'] is None:
            item['title'] = self.title = doc('div.zw_bt').text()
        if item['title'] is None:
            item['title'] = self.title = doc('h1.zw_title').text()
        
        
        item['release_time'] = ''
        
        item['source'] = u"新浪"
        item['author'] = ''
        item['pic_url'] = ''

        imgs = content_node('img')
        image_urls = []
        for img in imgs:
            src = img.get('src')
            # skip images without a src and gif placeholders
            if not src or ".gif" in src:
                continue
            imgs.eq(imgs.index(img)).before('<br>')
            imgs.eq(imgs.index(img)).append('<br>')
            image_urls.append(self.getRealURI(src))
        item['image_urls'] = image_urls

        content = content_node.__unicode__()
        item['content'] = self.content = content
        return item
Example #21
def sanitize_description(value):
    cleaned = PyQuery(value)
    cleaned = cleaned.remove('span.playMetaText')
    cleaned.remove('span.playMetaText')
    cleaned.remove('time')
    cleaned.remove('strong')

    return cleaned.html().split('<span>')[-1:][0].replace('</span>', '')
Example #22
def sanitize_html2(value):
    soup = PyQuery(value)
    soup = soup.remove("span.playMetaText")
    soup.remove("span.playMetaText")
    soup.remove("time")
    soup.remove("strong")

    return soup.html().split("<span>")[-1:]
Example #24
def amazon_general_descr(centerCol, pqhtml):
    # print centerCol.outerHtml()
    # print pqhtml.outerHtml()
    descr = centerCol('#featurebullets_feature_div').remove(
        'script').text() or ''
    descr += (pqhtml('#productDescription').remove('script').text() or '')

    if not descr:
        for ele in pqhtml('script[type="text/javascript"]').items():
            if 'ProductDescriptionIframeResize' in ele.text():
                descr = re.search(r'var iframeContent = "(.*)";\n',
                                  ele.text()).groups()[0]
                descr = PyQuery(urllib.parse.unquote(descr))
                descr.remove('script')
                descr = descr('#productDescription').text()
                break
        else:
            raise ValueError('Get Descr Fail')

    return descr
Example #25
def feed(request, get_feed=get_feed):
    with shows_db() as shows:
        show_list = shows.values()

    d = PyQuery(get_feed(), parser="xml")

    for item in d("item"):
        ditem = PyQuery(item)
        title = ditem.find("title").text()
        match = detect_show(show_list, title)
        if match:
            name, episode = match
            # TODO: Record episode in the feed so that future versions of this episod will be ignored
        else:
            ditem.remove()

    response = Response()
    response.content_type = "application/rss+xml"
    response.ubody = unicode(d)
    response.cache_control = "no-cache"
    return response
Example #26
def test_KernelResult_repr_html():
    method = "foo"
    alternatives = ["a", "b", "c"]
    rank = [True, False, True]
    extra = {"alfa": 1}

    result = PyQuery(
        KernelResult(method=method,
                     alternatives=alternatives,
                     values=rank,
                     extra=extra)._repr_html_())

    expected = PyQuery("""
        <div class='rankresult'>
        <table id="T_cc7f5_" >
            <thead>
            <tr>
                <th class="blank level0" ></th>
                <th class="col_heading level0 col0" >a</th>
                <th class="col_heading level0 col1" >b</th>
                <th class="col_heading level0 col2" >c</th>
            </tr>
            </thead>
            <tbody>
            <tr>
                <th id="T_cc7f5_level0_row0" class="row_heading level0 row0" >
                    Kernel
                </th>
                <td id="T_cc7f5_row0_col0" class="data row0 col0" >True</td>
                <td id="T_cc7f5_row0_col1" class="data row0 col1" >False</td>
                <td id="T_cc7f5_row0_col2" class="data row0 col2" >True</td>
            </tr>
            </tbody>
        </table>
        <em class='rankresult-method'>Method: foo</em>
        </div>
        """)

    assert result.remove("style").text() == expected.remove("style").text()
Example #27
async def handle_summary(summary: str, rss: rss_class.Rss) -> str:
    # Normalize the summary so its HTML tags are consistent and easier to process
    try:
        summary_html = Pq(summary)
    except Exception as e:
        logger.info(f"{rss.name} 没有正文内容! E: {e}")
        return ""
    # Initialize the final message
    res_msg = ""

    # Decide whether to keep quoted/forwarded content; if kept, strip only the blockquote tags and keep their inner content
    if config.blockquote:
        blockquote_html = summary_html("blockquote")
        for blockquote in blockquote_html.items():
            blockquote.replace_with(blockquote.html())
    else:
        summary_html.remove("blockquote")

    # Check whether only posts that contain images should be pushed
    if not rss.only_pic:
        # Handle HTML tags and translation
        summary_text = await handle_html_tag(html=summary_html)
        # Remove the configured content patterns
        if rss.content_to_remove:
            for pattern in rss.content_to_remove:
                summary_text = re.sub(pattern, "", summary_text)
        res_msg += summary_text
        # Translate the processed body text
        if rss.translation:
            res_msg += await handle_translation(content=summary_text)

    # Handle images
    res_msg += await handle_img(html=summary_html,
                                img_proxy=rss.img_proxy,
                                img_num=rss.max_image_number)

    return res_msg + "\n"
Example #28
    def extract(self):
        self.html = re.sub('<!--.*?-->', '', self.html)
        doc = PyQuery(self.html)
        content_node = doc('.firstTopic')('div')
        content_node.remove('script')
        content_node.remove('.rate')
        content_node.remove('.affixContent')
        content_node.remove('.thread_gold')
        
        
        item = ContentItem()
        imgs = content_node('.p14')('img')
        img_all = []
        for img in imgs:
            if".gif" in img.get('src'):
                continue
            else:  
                imgs.eq(imgs.index(img)).append('<br>')
                imgs.eq(imgs.index(img)).before('<br>')
                img_all.append(self.getRealURI(img.get('src')))
        item['image_urls'] = img_all
        
        item['title'] = self.title = doc('#thread_title').text()
        content = content_node('.p14').__unicode__()
        content = PyQuery(content)
        del_style = content('div')
        for d in del_style:
            if d.get('style'):
                del_style.eq(del_style.index(d)).attr['style'] = ''
                
        content.remove('dl.rate_list')
        content.remove('span[style = "font-size:12px"]')
        content.remove('dl.rate')
        item['content'] = self.content = content.__unicode__()
        
        release_time = doc('.firstTopic')('.postTime').text()
        ob = re.compile(r'20\d\d.*\d\d')
        release_time = ob.findall(release_time)

        item['release_time'] = release_time[0]
#        item['release_switch_time'] = self.release_switch_time = time.mktime(time.strptime(release_time[0],u'%Y-%m-%d %H:%M:%S'))
        item['source'] = u"17173论坛"
        item['author'] = doc('.th1').eq(0).text()
        item['pic_url'] = ''
        
        return item
Example #29
    def get_beer_detail(self, url):
        """
        Follow the link to beer page to get detailed review information.
        """
        r = requests.get(BASE_URL + url)
        pq = PyQuery(r.text)
        pq = pq('#rating_fullview_content_2:first')  # user ratings section
        self.rating = self.clean_xml(pq('.BAscore_norm:first').text())

        # comment is the look/smell/taste/feel/overall appended to any
        # other comments. we remove the other sections so text()
        # return only the comments
        self.comment = pq('.muted:first').text()
        pq.remove('br')
        pq.remove('.muted')
        pq.remove('.BAscore_norm')
        self.comment += "\n" + pq.text()
Example #30
    def get_tweets(tweet_criteria,
                   receive_buffer=None,
                   location_search=False,
                   buffer_length=100,
                   proxy=None):
        refresh_cursor = ''
        results = []
        results_aux = []
        cookiejar = cookielib.CookieJar()

        if hasattr(tweet_criteria, 'username') and (
                tweet_criteria.username.startswith("\'")
                or tweet_criteria.username.startswith("\"")) and (
                    tweet_criteria.username.endswith("\'")
                    or tweet_criteria.username.endswith("\"")):
            tweet_criteria.username = tweet_criteria.username[1:-1]

        active = True

        while active:
            try:
                json = TweetManager.get_json_response(tweet_criteria,
                                                      refresh_cursor,
                                                      cookiejar, proxy)
                if len(json['items_html'].strip()) == 0:
                    break

                refresh_cursor = json['min_position']
                scraped_tweets = PyQuery(json['items_html'])
                # Remove incomplete tweets withheld by Twitter Guidelines
                scraped_tweets.remove('div.withheld-tweet')
                tweets = scraped_tweets('div.js-stream-tweet')

                if len(tweets) == 0:
                    break

                for tweet_html in tweets:
                    tweetPQ = PyQuery(tweet_html)
                    tweet = model.Tweet()

                    username_tweet = tweetPQ(
                        "span:first.username.u-dir b").text()
                    txt = re.sub(r"\s+", " ",
                                 tweetPQ("p.js-tweet-text").text())
                    txt = txt.replace('# ', '#')
                    txt = txt.replace('@ ', '@')

                    print(
                        colored("@" + username_tweet + ": ", "red") +
                        colored(txt, "green") + "\n")

                    retweets = int(
                        tweetPQ(
                            "span.ProfileTweet-action--retweet span.ProfileTweet-actionCount"
                        ).attr("data-tweet-stat-count").replace(",", ""))
                    favorites = int(
                        tweetPQ(
                            "span.ProfileTweet-action--favorite span.ProfileTweet-actionCount"
                        ).attr("data-tweet-stat-count").replace(",", ""))
                    dateSec = int(
                        tweetPQ("small.time span.js-short-timestamp").attr(
                            "data-time"))
                    id = tweetPQ.attr("data-tweet-id")
                    permalink = tweetPQ.attr("data-permalink-path")
                    user_id = int(
                        tweetPQ("a.js-user-profile-link").attr("data-user-id"))

                    if location_search:
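                        # fetch the tweet's status page and scrape its geo pivot link for a location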
                        page = requests.get(
                            'https://twitter.com/tubiity/status/' + id)
                        script_geo = html.fromstring(page.content)
                        location = script_geo.xpath(
                            '//a[@class="u-textUserColor js-nav js-geo-pivot-link"]/text()'
                        )
                        sp_location = ','.join(location)
                        tweet.geo = sp_location
                    else:
                        geo = ''
                        tweet.geo = geo

                        # user-information
                        ''' If this code block is uncommented, application will be slower due to response time'''
                        '''result = requests.get("https://twitter.com/" + username_tweet)
                        c = result.content

                        soup = BeautifulSoup(c, "html.parser")
                        liste = []
                        samples = soup.find_all("a",
                                                    "ProfileNav-stat ProfileNav-stat--link u-borderUserColor u-textCenter js-tooltip js-openSignupDialog js-nonNavigable u-textUserColor")
                            # Follower, Follow and number of likes in list
                        for a in samples:
                            liste.append(a.attrs['title'])
                        '''

                    tweet.id = id
                    tweet.permalink = 'https://twitter.com' + permalink
                    tweet.username = username_tweet
                    tweet.text = txt
                    tweet.date = datetime.datetime.fromtimestamp(dateSec)
                    tweet.retweets = retweets
                    tweet.favorites = favorites
                    tweet.mentions = " ".join(
                        re.compile('(@\\w*)').findall(tweet.text))
                    tweet.hashtags = " ".join(
                        re.compile('(#\\w*)').findall(tweet.text))
                    tweet.user_id = user_id

                    results.append(tweet)
                    results_aux.append(tweet)

                    if receive_buffer and len(results_aux) >= buffer_length:
                        receive_buffer(results_aux)
                        results_aux = []

                    if tweet_criteria.maxTweets > 0 and len(
                            results) >= tweet_criteria.maxTweets:
                        active = False
                        break

            except Exception:
                # flush anything collected so far before giving up
                if receive_buffer and results_aux:
                    receive_buffer(results_aux)
                return results

        if receive_buffer and len(results_aux) > 0:
            receive_buffer(results_aux)

        return results
Example #31
    def getTweets(tweetCriteria,
                  receiveBuffer=None,
                  bufferLength=100,
                  proxy=None):
        refreshCursor = ''

        results = []
        resultsAux = []
        cookieJar = http.cookiejar.CookieJar()

        if hasattr(tweetCriteria, 'username') and (
                tweetCriteria.username.startswith("\'")
                or tweetCriteria.username.startswith("\"")) and (
                    tweetCriteria.username.endswith("\'")
                    or tweetCriteria.username.endswith("\"")):
            tweetCriteria.username = tweetCriteria.username[1:-1]

        active = True

        totalNumTweets = 0

        while active:
            json, fullurl = TweetManager.getJsonReponse(
                tweetCriteria, refreshCursor, cookieJar, proxy)
            if len(json['items_html'].strip()) == 0:
                # print("break")
                break

            refreshCursor = json['min_position']
            # print ("refreshCursor is {}".format(refreshCursor))
            scrapedTweets = PyQuery(json['items_html'])
            #Remove incomplete tweets withheld by Twitter Guidelines
            scrapedTweets.remove('div.withheld-tweet')
            tweets = scrapedTweets('div.js-stream-tweet')

            totalNumTweets += len(tweets)
            if len(tweets) == 0:
                break

            for tweetHTML in tweets:
                tweetPQ = PyQuery(tweetHTML)
                tweet = models.Tweet()

                try:
                    usernameTweet = tweetPQ(
                        "span:first.username.u-dir b").text()
                except Exception as e:
                    usernameTweet = ""
                    print("can not get username")
                try:
                    # get text in different tag seperated by \n
                    tweet_text = tweetPQ("p.js-tweet-text").text(
                        squash_space=False)
                    tweet_text_list = tweet_text.split("\n")
                    # replace the "" with " ", for the \n\n situation
                    for i, v in enumerate(tweet_text_list):
                        if v == "":
                            tweet_text_list[i] = " "
                    txt = "".join(tweet_text_list)
                    # print(" ".join(re.compile('(#\\w*)').findall(txt)))

                except Exception as e:
                    txt = ""
                    print("can not get txt")
                    traceback.print_exc()

                try:
                    reply = int(
                        tweetPQ(
                            "span.ProfileTweet-action--reply span.ProfileTweet-actionCount"
                        ).attr("data-tweet-stat-count").replace(",", ""))
                except Exception as e:
                    reply = 0
                    print("can not get retweets.")
                    traceback.print_exc()

                try:
                    retweets = int(
                        tweetPQ(
                            "span.ProfileTweet-action--retweet span.ProfileTweet-actionCount"
                        ).attr("data-tweet-stat-count").replace(",", ""))
                except Exception as e:
                    retweets = 0
                    print("can not get retweets")
                    traceback.print_exc()

                try:
                    favorites = int(
                        tweetPQ(
                            "span.ProfileTweet-action--favorite span.ProfileTweet-actionCount"
                        ).attr("data-tweet-stat-count").replace(",", ""))
                except Exception as e:
                    favorites = 0
                    print("can not get retweets.")
                    traceback.print_exc()

                try:
                    dateSec = int(
                        tweetPQ("small.time span.js-short-timestamp").attr(
                            "data-time"))
                except Exception as e:
                    dateSec = 0
                    print("can not get dateSec")
                    traceback.print_exc()

                try:
                    idx = tweetPQ.attr("data-tweet-id")
                except Exception as e:
                    idx = ""
                    print("can not get id")
                    traceback.print_exc()

                try:
                    permalink = tweetPQ.attr("data-permalink-path")
                except Exception as e:
                    permalink = ""
                    print("can not get permalink")
                    traceback.print_exc()

                try:
                    url = tweetPQ('a.twitter-timeline-link').attr(
                        'data-expanded-url')
                except Exception as e:
                    url = ""
                    print("can not get url")
                    traceback.print_exc()

                tweet.url = url
                # hashtag
                try:
                    hashtags = tweetPQ(
                        'a.twitter-hashtag.pretty-link.js-nav').text().replace(
                            "# ", "#")
                except Exception as e:
                    hashtags = ""
                    traceback.print_exc()

                tweet.hashtags = hashtags.replace('\n', '')

                geo = ''
                try:
                    geoSpan = tweetPQ('span.Tweet-geo')
                    if len(geoSpan) > 0:
                        geo = geoSpan.attr('title')
                except Exception as e:
                    geo = ''

                tweet.id = idx
                tweet.permalink = 'https://twitter.com' + permalink
                tweet.username = usernameTweet
                tweet.text = txt
                tweet.date = datetime.datetime.fromtimestamp(dateSec)
                tweet.reply = reply
                tweet.retweets = retweets
                tweet.favorites = favorites
                tweet.mentions = " ".join(
                    re.compile('(@\\w*)').findall(tweet.text))
                # tweet.hashtags = " ".join(re.compile('(#\\w*)').findall(tweet.text))
                tweet.geo = geo

                results.append(tweet)
                resultsAux.append(tweet)

                if receiveBuffer and len(resultsAux) >= bufferLength:
                    receiveBuffer(resultsAux)
                    resultsAux = []

                if tweetCriteria.maxTweets > 0 and len(
                        results) >= tweetCriteria.maxTweets:
                    active = False
                    break

        print("url: {}".format(fullurl))

        if receiveBuffer and len(resultsAux) > 0:
            receiveBuffer(resultsAux)

        return results, totalNumTweets
Example #32
	def getTweets(tweetCriteria, receiveBuffer=None, bufferLength=100, proxy=None):
		refreshCursor = ''
	
		results = []
		resultsAux = []
		cookieJar = http.cookiejar.CookieJar()

		active = True

		while active:
			json = TweetManager.getJsonReponse(tweetCriteria, refreshCursor, cookieJar, proxy)
			if len(json['items_html'].strip()) == 0:
				break

			refreshCursor = json['min_position']
			scrapedTweets = PyQuery(json['items_html'])
			#Remove incomplete tweets withheld by Twitter Guidelines
			scrapedTweets.remove('div.withheld-tweet')
			tweets = scrapedTweets('div.js-stream-tweet')
			
			if len(tweets) == 0:
				break
			
			for tweetHTML in tweets:
				tweetPQ = PyQuery(tweetHTML)
				tweet = models.Tweet()
				
				usernameTweet = tweetPQ("span.username.js-action-profile-name b").text()
				txt = re.sub(r"\s+", " ", tweetPQ("p.js-tweet-text").text().replace('# ', '#').replace('@ ', '@'))
				retweets = int(tweetPQ("span.ProfileTweet-action--retweet span.ProfileTweet-actionCount").attr("data-tweet-stat-count").replace(",", ""))
				favorites = int(tweetPQ("span.ProfileTweet-action--favorite span.ProfileTweet-actionCount").attr("data-tweet-stat-count").replace(",", ""))
				dateSec = int(tweetPQ("small.time span.js-short-timestamp").attr("data-time"))
				id = tweetPQ.attr("data-tweet-id")
				permalink = tweetPQ.attr("data-permalink-path")
				user_id = int(tweetPQ("a.js-user-profile-link").attr("data-user-id"))
				
				geo = ''
				geoSpan = tweetPQ('span.Tweet-geo')
				if len(geoSpan) > 0:
					geo = geoSpan.attr('title')
				urls = []
				for link in tweetPQ("a"):
					try:
						urls.append((link.attrib["data-expanded-url"]))
					except KeyError:
						pass
				tweet.id = id
				tweet.permalink = 'https://twitter.com' + permalink
				tweet.username = usernameTweet
				
				tweet.text = txt
				tweet.date = datetime.datetime.fromtimestamp(dateSec)
				tweet.formatted_date = datetime.datetime.fromtimestamp(dateSec).strftime("%a %b %d %X +0000 %Y")
				tweet.retweets = retweets
				tweet.favorites = favorites
				tweet.mentions = " ".join(re.compile('(@\\w*)').findall(tweet.text))
				tweet.hashtags = " ".join(re.compile('(#\\w*)').findall(tweet.text))
				tweet.geo = geo
				tweet.urls = ",".join(urls)
				tweet.author_id = user_id
				
				results.append(tweet)
				resultsAux.append(tweet)
				
				if receiveBuffer and len(resultsAux) >= bufferLength:
					receiveBuffer(resultsAux)
					resultsAux = []
				
				if tweetCriteria.maxTweets > 0 and len(results) >= tweetCriteria.maxTweets:
					active = False
					break
					
		
		if receiveBuffer and len(resultsAux) > 0:
			receiveBuffer(resultsAux)
		
		return results
Example #33
 def extract_content(self, html):
     html = re.sub(r'xmlns="[^"]+"', "", html)
     doc = PyQuery(html)
     content_node = doc.find(self.content_css_selector)
     if self.should_remove_css_selector:
         doc.remove(self.should_remove_css_selector)
     return content_node.outer_html()
Example #34
    def getTweets(tweetCriteria,
                  receiveBuffer=None,
                  bufferLength=100,
                  proxy=None):
        refreshCursor = ''

        results = []
        resultsAux = []
        cookieJar = cookielib.CookieJar()

        if hasattr(tweetCriteria, 'username') and (
                tweetCriteria.username.startswith("\'")
                or tweetCriteria.username.startswith("\"")) and (
                    tweetCriteria.username.endswith("\'")
                    or tweetCriteria.username.endswith("\"")):
            tweetCriteria.username = tweetCriteria.username[1:-1]

        active = True

        while active:
            json = TweetManager.getJsonReponse(tweetCriteria, refreshCursor,
                                               cookieJar, proxy)
            if len(json['items_html'].strip()) == 0:
                break

            refreshCursor = json['min_position']
            scrapedTweets = PyQuery(json['items_html'])
            #Remove incomplete tweets withheld by Twitter Guidelines
            scrapedTweets.remove('div.withheld-tweet')
            tweets = scrapedTweets('div.js-stream-tweet')

            if len(tweets) == 0:
                break

            for tweetHTML in tweets:
                tweetPQ = PyQuery(tweetHTML)
                tweet = models.Tweet()

                usernameTweet = tweetPQ("span:first.username.u-dir b").text()
                txt = re.sub(
                    r"\s+", " ",
                    tweetPQ("p.js-tweet-text").text().replace('# ',
                                                              '#').replace(
                                                                  '@ ', '@'))
                retweets = int(
                    tweetPQ(
                        "span.ProfileTweet-action--retweet span.ProfileTweet-actionCount"
                    ).attr("data-tweet-stat-count").replace(",", ""))
                favorites = int(
                    tweetPQ(
                        "span.ProfileTweet-action--favorite span.ProfileTweet-actionCount"
                    ).attr("data-tweet-stat-count").replace(",", ""))
                dateSec = int(
                    tweetPQ("small.time span.js-short-timestamp").attr(
                        "data-time"))
                id = tweetPQ.attr("data-tweet-id")
                permalink = tweetPQ.attr("data-permalink-path")

                geo = ''
                geoSpan = tweetPQ('span.Tweet-geo')
                if len(geoSpan) > 0:
                    geo = geoSpan.attr('title')

                tweet.id = id
                tweet.permalink = 'https://twitter.com' + permalink
                tweet.username = usernameTweet
                tweet.text = txt
                tweet.date = datetime.datetime.fromtimestamp(dateSec)
                tweet.retweets = retweets
                tweet.favorites = favorites
                tweet.mentions = " ".join(
                    re.compile('(@\\w*)').findall(tweet.text))
                tweet.hashtags = " ".join(
                    re.compile('(#\\w*)').findall(tweet.text))
                tweet.geo = geo

                results.append(tweet)
                resultsAux.append(tweet)

                if receiveBuffer and len(resultsAux) >= bufferLength:
                    receiveBuffer(resultsAux)
                    resultsAux = []

                if tweetCriteria.maxTweets > 0 and len(
                        results) >= tweetCriteria.maxTweets:
                    active = False
                    break

        if receiveBuffer and len(resultsAux) > 0:
            receiveBuffer(resultsAux)

        return results
Example #35
    def getTweets(tweetCriteria,
                  receiveBuffer=None,
                  bufferLength=100,
                  proxy=None,
                  debug=False):
        """Get tweets that match the tweetCriteria parameter
        A static method.

        Parameters
        ----------
        tweetCriteria : tweetCriteria, an object that specifies a match criteria
        receiveBuffer : callable, a function that will be called upon a getting next `bufferLength' tweets
        bufferLength: int, the number of tweets to pass to `receiveBuffer' function
        proxy: str, a proxy server to use
        debug: bool, output debug information
        """
        results = []
        resultsAux = []
        cookieJar = http.cookiejar.CookieJar()
        user_agent = random.choice(TweetManager.user_agents)

        all_usernames = []
        usernames_per_batch = 20

        if hasattr(tweetCriteria, 'username'):
            if type(tweetCriteria.username) == str or not hasattr(
                    tweetCriteria.username, '__iter__'):
                tweetCriteria.username = [tweetCriteria.username]

            usernames_ = [u.lstrip('@') for u in tweetCriteria.username if u]
            all_usernames = sorted({u.lower() for u in usernames_ if u})
            n_usernames = len(all_usernames)
            n_batches = n_usernames // usernames_per_batch + (
                n_usernames % usernames_per_batch > 0)
        else:
            n_batches = 1

        for batch in range(n_batches):  # process all_usernames by batches
            refreshCursor = ''
            batch_cnt_results = 0
            sleep(1)
            if all_usernames:  # a username in the criteria?
                tweetCriteria.username = all_usernames[
                    batch * usernames_per_batch:batch * usernames_per_batch +
                    usernames_per_batch]

            active = True
            while active:
                json = TweetManager.getJsonResponse(tweetCriteria,
                                                    refreshCursor,
                                                    cookieJar,
                                                    proxy,
                                                    user_agent,
                                                    debug=debug)
                if len(json['items_html'].strip()) == 0:
                    break

                refreshCursor = json['min_position']
                scrapedTweets = PyQuery(json['items_html'])
                #Remove incomplete tweets withheld by Twitter Guidelines
                scrapedTweets.remove('div.withheld-tweet')
                tweets = scrapedTweets('div.js-stream-tweet')

                if len(tweets) == 0:
                    break

                for tweetHTML in tweets:
                    tweetPQ = PyQuery(tweetHTML)
                    tweet = models.Tweet()

                    usernames = tweetPQ("span.username.u-dir b").text().split()
                    if not len(usernames):  # fix for issue #13
                        continue

                    tweet.username = usernames[0]
                    tweet.to = usernames[1] if len(
                        usernames
                    ) >= 2 else None  # take the first recipient if many
                    rawtext = TweetManager.textify(
                        tweetPQ("p.js-tweet-text").html(), tweetCriteria.emoji)
                    tweet.text = re.sub(r"\s+", " ", rawtext)\
                        .replace('# ', '#').replace('@ ', '@').replace('$ ', '$')
                    tweet.retweets = int(
                        tweetPQ(
                            "span.ProfileTweet-action--retweet span.ProfileTweet-actionCount"
                        ).attr("data-tweet-stat-count").replace(",", ""))
                    tweet.favorites = int(
                        tweetPQ(
                            "span.ProfileTweet-action--favorite span.ProfileTweet-actionCount"
                        ).attr("data-tweet-stat-count").replace(",", ""))
                    tweet.replies = int(
                        tweetPQ(
                            "span.ProfileTweet-action--reply span.ProfileTweet-actionCount"
                        ).attr("data-tweet-stat-count").replace(",", ""))
                    tweet.id = tweetPQ.attr("data-tweet-id")
                    tweet.permalink = 'https://twitter.com' + tweetPQ.attr(
                        "data-permalink-path")
                    tweet.author_id = int(
                        tweetPQ("a.js-user-profile-link").attr("data-user-id"))

                    dateSec = int(
                        tweetPQ("small.time span.js-short-timestamp").attr(
                            "data-time"))
                    tweet.date = datetime.datetime.fromtimestamp(
                        dateSec, tz=datetime.timezone.utc)
                    tweet.formatted_date = datetime.datetime.fromtimestamp(dateSec, tz=datetime.timezone.utc)\
                                                            .strftime("%a %b %d %X +0000 %Y")
                    tweet.hashtags, tweet.mentions = TweetManager.getHashtagsAndMentions(
                        tweetPQ)

                    geoSpan = tweetPQ('span.Tweet-geo')
                    if len(geoSpan) > 0:
                        tweet.geo = geoSpan.attr('title')
                    else:
                        tweet.geo = ''

                    urls = []
                    for link in tweetPQ("a"):
                        try:
                            urls.append((link.attrib["data-expanded-url"]))
                        except KeyError:
                            pass

                    tweet.urls = ",".join(urls)

                    results.append(tweet)
                    resultsAux.append(tweet)

                    if receiveBuffer and len(resultsAux) >= bufferLength:
                        receiveBuffer(resultsAux)
                        resultsAux = []

                    batch_cnt_results += 1
                    if tweetCriteria.maxTweets > 0 and batch_cnt_results >= tweetCriteria.maxTweets:
                        active = False
                        break

            if receiveBuffer and len(resultsAux) > 0:
                receiveBuffer(resultsAux)
                resultsAux = []

        return results
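A rough usage sketch for the getTweets method above (not part of the original module): it assumes the method is a static member of a TweetManager class, as its internal calls suggest, and substitutes a hypothetical FakeCriteria object and print_batch callback for the real tweetCriteria and receiveBuffer arguments.

# hypothetical criteria object exposing the attributes getTweets reads
class FakeCriteria:
    username = ['jack', 'twitter']   # queried in batches of 20 usernames
    maxTweets = 50                   # stop after 50 tweets per username batch

def print_batch(tweets):
    # called with every `bufferLength` newly scraped tweets
    for t in tweets:
        print(t.date, t.username, t.text[:80])

# tweets = TweetManager.getTweets(FakeCriteria(), receiveBuffer=print_batch,
#                                 bufferLength=25)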
Example #36
0
File: agent.py Project: johnjansen/Magpie
class MagpieAgentThread( threading.Thread ):

    def __init__( self, thread_id ):
        threading.Thread.__init__( self )
        self.thread_id = thread_id
        self.mongo = MongoClient( ).katipo.pages

    def next_job( self ):
        data = stored_procedure_as_dict( "next_job" )
        if len( data ) > 0:
            self.current_job = data
            return True
        else:
            self.current_job = None
            return False

    def process( self ):
        self.discovered_urls = set()
        self.basic_content_type = "text/html"
        self.message_stack = [ "-" * 80 ]

        # resolve the address
        uri = urlparse( self.current_job[ 'url' ] )
        answers = dns.resolver.query( uri.hostname, 'A' )
        for answer in answers:
            self.message_stack.append( "DNS) %s" % answer )

        try:
            self.current_response = requests.get( self.current_job[ 'url' ], stream=True )
            self.basic_content_type = self.current_response.headers[ 'content-type' ].split( ";" )[ 0 ]
        except:
            self.current_response = None
            self.basic_content_type = None

        if self.current_response:
            for r in self.current_response.history:
                self.message_stack.append( "-URL (%s) %s" % ( r.status_code, r.url ) )

            self.message_stack.append( "+URL (%s) %s" % ( self.current_response.status_code, self.current_response.url ) )
            self.message_stack.append( "BASIC CONTENT-TYPE) %s" % self.basic_content_type )
            self.message_stack.append( "CONTENT TYPE) %s" % self.current_response.headers['content-type'] )
            self.message_stack.append( "ENCODING) %s" % self.current_response.encoding )

            if self.basic_content_type in ACCEPTABLE_CONTENT_TYPES:
                # we need to handle the odd, but real case of the mystery <? palantir_blog_list('sidebar') ?> tag
                # tidy_response_text = re.sub( "<\?.*?\?>", "", self.current_response.text )
                tidy_response_text = re.sub( "<\?.*?\?>", "", self.current_response.text )
                tidy_response_text = re.sub( "<!--.*?-->", "", tidy_response_text )

                self.dom = PyQuery( tidy_response_text, parser='html' )

                self.titles = [ safe_str( title.text ) for title in self.dom("title") ]

                for a in self.dom('a'):
                    a = PyQuery(a)
                    new_url = PyQuery(a).attr.href
                    if new_url != None:
                        new_url = urldefrag( urljoin( self.current_response.url, new_url ) )[0]
                        self.discovered_urls.add( new_url )

                self.message_stack.append( "DISCOVERED) %s" % len( self.discovered_urls ) )

                # BOILERPIPE
                for excluded_tag in BOILERPIPE_REMOVE_TAGS:
                    self.dom( excluded_tag ).after( "\n" )
                    self.dom.remove( excluded_tag )

                # remove tags with style="display:none"
                # http://www.microsoft.com/en-us/legal/intellectualproperty/copyright/default.aspx          
                display_none_pattern = re.compile( "display: ?none" )

                for x in self.dom("*"):
                    try:
                        tag = PyQuery(x)
                        if not tag.attr("style") == None:
                            if re.match( display_none_pattern, tag.attr("style") ):
                                tag.remove()
                    except Exception as inst:
                        print type(inst)
                        print inst.args
                        print inst

                self.save()
            else:
                self.message_stack.append( "DISCARDED" )
        else:
            self.message_stack.append( "NO RESPONSE" )

    def save( self ):
        try:
            # if domain( self.current_response.url ) == domain( self.current_job['url'] ):
            scraper_name = SCRAPER_NAME
            if 'scraper_name' in self.current_job:
                scraper_name =  self.current_job[ 'scraper_name' ].lower()

            scraper_version = SCRAPER_VERSION
            if 'scraper_version' in self.current_job:
                scraper_version = self.current_job[ 'scraper_version' ]

            qid_base = "%s:%s:%s" % ( self.current_job[ 'starting_point' ], self.current_job[ 'url' ], scraper_name )

            qid_hash = hashlib.md5()
            qid_hash.update( qid_base )
            qid = qid_hash.hexdigest()

            headers = self.current_response.headers
            scraped_at = datetime.utcnow().strftime( r'%Y-%m-%dT%H:%M:%SZ' )
            last_modified = scraped_at
            if 'date' in headers:
                last_modified = _parse_http_datetime( headers['date'] ).strftime( r'%Y-%m-%dT%H:%M:%SZ' )

            quid_orgid = None
            if 'org_id' in self.current_job:
                quid_orgid = self.current_job[ 'org_id' ]

            alternative_urls = []
            if self.current_response.history != None:
                for x in self.current_response.history:
                    alternative_urls.append( x.url )

            content_type = "text/html"
            if "content-type" in headers:
                content_type = headers['content-type']

            content_type = content_type.split( ";" )[ 0 ]

            url_selection_rule = "(page_count < %s) && (depth <= %s)" % ( self.current_job[ 'page_limit' ], self.current_job[ 'depth_limit' ] )
            
            if len( alternative_urls ) > 0:
                print qid
                
            # sample tokenize
            tokens = []
            for token in re.compile( "\W", re.UNICODE).split( safe_unicode( self.dom.text().lstrip().rstrip() ) ):
                if token.lstrip().rstrip() != '':
                    tokens.append( token )

            # find any docs which already exist with this url
            matches = []
            for d in self.mongo.find( { "meta.data.source_urls" : self.current_response.url } ):
                matches.append( d["_id"] )

            print( "matches for %s\n%s" % ( self.current_response.url, matches ) )

            self.mongo.insert( 
                {
                   u"meta" : {
                       u"data" : {
                           u"qid" : qid,
                           u"qid_base" : qid_base,
                           u"content_encoding" : u"UTF-8",
                           u"content_type" : content_type,
                           u"source_url" : alternative_urls + [ self.current_response.url ], 
                           u"doc_type_name" : u"unstructured/web/WEBPAGE",
                           u"doc_type_version" : u"1.0.0",
                           u"scraper_name" : scraper_name,
                           u"scraper_version" : scraper_version,
                           u"scraped_at" : scraped_at,
                           u"date_publication" : last_modified,
                           u"quid_orgid" : quid_orgid,
                           u"katipo" : {
                               u"starting_url" : self.current_job[ 'starting_point' ],
                               u"last_modified" : last_modified,
                               u"domain" : self.current_job[ 'domain' ]
                           }
                        },
                       u"v" : 2,
                       u"id" : qid
                    },
                   u"raw" : {
                       u"data" : self.current_response.text,
                       u"v" : 2,
                       u"id" : qid
                    },
                   u"structured" : {
                       u"data" : {
                           u"http_headers" : headers,
                           u"page_depth" : self.current_job[ 'depth' ],
                           u"job_count" : self.current_job[ 'job_count' ],
                           u"url_selection_rule" : url_selection_rule,
                           u"meta_tags" : u'',
                           u"body_text" : self.dom.text().lstrip().rstrip(),
                           u"tokens" : tokens
                        },
                       u"v" : 2,
                       u"id" : qid
                    }
                }
            )

        except Exception as inst:
            self.message_stack.append( inst )
            # self.message_stack.append( self.dom.text() )
            print string.join( self.message_stack, "\n" )


    def acknowledge( self ):
        stored_procedure( "acknowledge_job", self.current_job['domain'], self.current_job['url'] )
        for url in self.discovered_urls:
            new_url_scheme = urlsplit( url )[0]
            if domain(url) == domain( self.current_job['url'] ) and new_url_scheme in ACCEPTABLE_SCHEMES:
                self.message_stack.append( "ACCEPT) %s"  % url )
                self.queue( url )
            else:
                self.message_stack.append( "REJECT) %s"  % url )
        self.message_stack.append( "ACK'd) %s" % self.current_job['url'] )

    def fail( self ):
        self.message_stack.append( "FAIL" )
        # self.acknowledge( )
        pass

    def queue( self, url ):
        j, d = self.current_job, ( int( self.current_job['depth'] ) + 1 )
        stored_procedure( "queue_job", j['domain'], j['page_limit'], j['depth_limit'], url, d, j['url'], j['starting_point'], j['batch'], j['org_id'] )


    def run( self ):
        for x in range( 3600 ):
            while self.next_job():
                try:
                    self.process()
                    self.acknowledge( )
                except Exception as inst:
                    print inst
                    self.fail()

                # print string.join( self.message_stack, "\n" )

            sleep( 1 )
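The display:none stripping inside process() above can be exercised in isolation. Below is a minimal, self-contained sketch of the same PyQuery pattern; the sample markup is invented for illustration.

import re
from pyquery import PyQuery

display_none_pattern = re.compile("display: ?none")

dom = PyQuery('<div><p style="display:none">hidden</p>'
              '<p style="color:red">visible</p></div>')

for x in dom("*"):
    tag = PyQuery(x)
    style = tag.attr("style")
    # drop any element whose inline style starts with display:none
    if style is not None and re.match(display_none_pattern, style):
        tag.remove()

print(dom.html())   # only the visible paragraph remains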
Example #37
0
    def getTweets(tweet_criteria,
                  receive_buffer=None,
                  buffer_length=100,
                  proxy=None):
        refresh_cursor = ''

        results = []
        results_aux = []
        cookie_jar = http.cookiejar.CookieJar()

        active = True

        while active:
            json = TweetManager.getJsonResponse(tweet_criteria, refresh_cursor,
                                                cookie_jar, proxy)

            if len(json['items_html'].strip()) == 0:
                break

            refresh_cursor = json['min_position']
            scraped_tweets = PyQuery(json['items_html'])
            scraped_tweets.remove('div.withheld-tweet')
            tweets = scraped_tweets('div.js-stream-tweet')

            if len(tweets) == 0:
                break
            for tweet_HTML in tweets:
                tweet_PQ = PyQuery(tweet_HTML)
                tweet = TweetModel()

                username = tweet_PQ.attr("data-screen-name")
                text = re.sub(
                    r"\s+", " ",
                    tweet_PQ("p.js-tweet-text").text().replace('# ',
                                                               '#').replace(
                                                                   '@ ', '@'))
                retweets = int(
                    tweet_PQ(
                        "span.ProfileTweet-action--retweet span.ProfileTweet-actionCount"
                    ).attr("data-tweet-stat-count").replace(",", ""))
                likes = int(
                    tweet_PQ(
                        "span.ProfileTweet-action--favorite span.ProfileTweet-actionCount"
                    ).attr("data-tweet-stat-count").replace(",", ""))
                date = int(
                    tweet_PQ("small.time span.js-short-timestamp").attr(
                        "data-time"))
                id = tweet_PQ.attr("data-tweet-id")
                permalink = tweet_PQ.attr("data-permalink-path")
                user_id = int(
                    tweet_PQ("a.js-user-profile-link").attr("data-user-id"))
                media = tweet_PQ("div.AdaptiveMedia-photoContainer").attr(
                    "data-image-url")
                geo = ''
                geo_span = tweet_PQ('span.Tweet-geo')
                if len(geo_span) > 0:
                    geo = geo_span.attr('title')
                urls = []
                for link in tweet_PQ("a"):
                    try:
                        urls.append((link.attrib["data-expanded-url"]))
                    except KeyError:
                        pass
                tweet.id = id
                tweet.source = 'https://twitter.com' + permalink
                tweet.username = username
                tweet.text = text
                tweet.created_at = date
                tweet.retweet_count = retweets
                tweet.favorite_count = likes
                tweet.mentions = " ".join(
                    re.compile('(@\\w*)').findall(tweet.text))
                tweet.hashtags = " ".join(
                    re.compile('(#\\w*)').findall(tweet.text))
                tweet.geo = geo
                tweet.urls = ",".join(urls)
                tweet.author_id = user_id
                tweet.media = media
                results.append(tweet)
                results_aux.append(tweet)

                if receive_buffer and len(results_aux) >= buffer_length:
                    receive_buffer(results_aux)
                    results_aux = []

                if tweet_criteria.max_tweets > 0 and len(
                        results) >= tweet_criteria.max_tweets:
                    active = False
                    break

        if receive_buffer and len(results_aux) > 0:
            receive_buffer(results_aux)

        return results
Example #38
0
def faltantes():
    bajados = {int(l.split('.')[0]) - 1 for l in glob.glob('*.md')}
    links = get_all_links()
    faltan = set(range(len(links))) - bajados
    return [links[i] if i in faltan else None for i in range(len(links))]


for did, url in enumerate(faltantes()):
    if not url:
        continue
    try:
        d = PyQuery(url=url, headers=headers)

        # cleanups
        d.remove('ul.actions, #fb-root, script, div[style="clear:both"]')
        for cf in d('.clearfix'):
            if d(cf).text() == "":
                d(cf).remove()

        fecha = d('dd.published').text()
        d('.article-info').before(u'<p>[{}]</p>'.format(fecha))
        d.remove('.article-info')

        # no link in the title
        titulo = d('.item-page h2 a').text().decode('utf8')
        d('.item-page h2').text(titulo)

        # clean html content
        discurso = d('.item-page').html()
        import ipdb;ipdb.set_trace()
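The bookkeeping in faltantes() is a plain set difference between the expected link indices and the indices already saved as N.md files (file names are 1-based, link indices 0-based). A toy run of that logic, with made-up file names and link count:

downloaded_files = ['1.md', '3.md']          # pretend glob.glob('*.md') returned these
bajados = {int(name.split('.')[0]) - 1 for name in downloaded_files}   # {0, 2}
total_links = 4
faltan = set(range(total_links)) - bajados   # {1, 3}: still missing
print(sorted(faltan))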
Example #39
0
def strip_tags(text):
    html = PyQuery(text)
    return html.remove('code').remove('a').text()
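A quick check of what strip_tags() keeps and drops, using the definition above on an invented snippet:

sample = ('<div>Use <code>pip install pyquery</code> or see '
          '<a href="#">the docs</a> for details</div>')
print(strip_tags(sample))   # prints the text with the <code> and <a> contents removed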
Example #40
0
    def extract(self):
        self.html = re.sub('<!--.*?-->', '', self.html)
        doc = PyQuery(self.html)
        doc.remove('div#tipswindow')
        content_node = doc('div#Cnt-Main-Article-QQ')
        if not content_node:
            content_node = doc('div#ArticleCnt')
        if not content_node:
            content_node = doc('div#textContent')
        if not content_node:
            content_node = doc('#content')
        if not content_node:
            content_node = doc('div[id = "qnews-content"]')
            
        content_node.remove('script')
        content_node.remove('style')
        content_node.remove('iframe')
        content_node.remove('div.adpip_Aritcle_QQ')
        content_node.remove('table#picInPic')
        content_node.remove('div.dayuw_ad')
        content_node.remove('div.tJieHot_')
        content_node.remove('div.b_new_mod')
        content_node.remove('div#awh_sports')
        content_node.remove('div[id = "photo-warp"]')
        content_node.remove('div#MorePic')
        content_node.remove('div#cmenu')
        content_node.remove('div#flashCff')
        content_node.remove('div#contTxt')
        content_node.remove('div#PGViframe')
        content_node.remove('div#Reading')
        content_node.remove('span[style = "BACKGROUND-COLOR: navy; COLOR: white"]')
        content_node.remove('img[width="592"][height="100"]')

        content = content_node.__unicode__()

        item = ContentItem()
        
        item['title'] = self.title = doc('h1').text()
        if not item['title']:
            item['title'] = self.title = doc('div#ArticleTit').text()
        if not item['title']:
            item['title'] = self.title = doc('h2').text()
            
        item['content'] = self.content = content
        
        item['release_time'] = self.release_time = doc('span.pubTime').text()
        p = re.compile(u"(20\d\d.*\d\d:\d\d)")

        if not self.release_time:
            self.release_time = doc('div[class = "info"]').text()
            if self.release_time == None:
                self.release_time = doc('div[id = "ArtFrom"]').text()
            if self.release_time == None:
                self.release_time = doc('div[class = "pubtime"]').text()
            if self.release_time == None:
                self.release_time = doc('span[id= "Freleasetime"]').text()
            if self.release_time == None:
                self.release_time = doc('td.xborderb1').eq(1).text()
                p = re.compile(u"(20.*-\d\d)")

                
            item['release_time'] = self.release_time = p.search(self.release_time).group()
        #item['release_switch_time'] = time.mktime(time.strptime(self.release_time,time_s))
            
        item['source'] = u"腾讯"
        item['author'] = ''
        item['pic_url'] = ''

        imgs = content_node('img')
        image_urls = []
        for img in imgs:
            if ".gif" in img.get('src'):
                continue
            if not img.get('src'):
                continue
            else:
                imgs.eq(imgs.index(img)).before('<br>')
                image_urls.append(self.getRealURI(img.get('src')))
        item['image_urls'] = image_urls

        return item
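extract() above leans on two PyQuery idioms: trying a chain of container selectors until one matches, and pruning unwanted children with remove() before serialising. A stripped-down sketch of that pattern on a made-up page (the or-chain below is equivalent to the if-not cascade used above):

from pyquery import PyQuery

html = ('<html><body><div id="textContent">'
        '<p>Article body.</p><script>ads();</script>'
        '</div></body></html>')
doc = PyQuery(html)

# try the known container ids in order, keep the first non-empty match
content_node = doc('div#Cnt-Main-Article-QQ') or doc('div#ArticleCnt') or doc('div#textContent')

# strip scripts and styles before serialising
content_node.remove('script')
content_node.remove('style')

print(content_node.html())   # -> <p>Article body.</p>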
Example #41
0
File: hbx.py Project: hellowac/drag
    def detail(self, url):
        try:

            resp = self.session.get(url, verify=False)

            status_code = resp.status_code
            pqhtml = PyQuery(resp.text or 'nothing')

            if status_code == 404:

                log_info = json.dumps(
                    dict(time=time.time(),
                         title=pqhtml('title').text(),
                         url=url))

                self.logger.info(log_info)

                data = tool.get_off_shelf(code=status_code,
                                          message=self.cfg.GET_ERR.get(
                                              'SCERR', 'ERROR'),
                                          backUrl=resp.url,
                                          html=pqhtml.outerHtml())

                return tool.return_data(successful=False, data=data)

            if status_code != 200:

                log_info = json.dumps(
                    dict(time=time.time(),
                         title=pqhtml('title').text(),
                         url=url))

                self.logger.info(log_info)

                data = tool.get_error(code=status_code,
                                      message=self.cfg.GET_ERR.get(
                                          'SCERR', 'ERROR'),
                                      backUrl=resp.url,
                                      html=pqhtml.outerHtml())

                return tool.return_data(successful=False, data=data)

            JscriptTxt = pqhtml('script').text()

            pqhtml.remove('script').remove('style')

            area = pqhtml('div#product-summary')

            # print area.outerHtml().encode('utf-8')

            buttonTxt = area('#product-form .add-button').text()

            if u'售罄' in buttonTxt.lower() or u'sold out' in buttonTxt.lower():

                log_info = json.dumps(
                    dict(time=time.time(),
                         title=pqhtml('title').text(),
                         url=url))

                self.logger.info(log_info)

                data = tool.get_off_shelf(code=status_code,
                                          message=self.cfg.GET_ERR.get(
                                              'SCERR', 'ERROR'),
                                          backUrl=resp.url,
                                          html=pqhtml.outerHtml())

                return tool.return_data(successful=False, data=data)

            detail = dict()

            # all images
            imgs = self.get_imgs(pqhtml)
            detail['imgs'] = imgs
            detail['img'] = imgs[0]

            # name
            detail['name'] = area('h1.brand').text() + ' ' + area(
                '.name').text()

            # currency
            currency = area('span.regular-price').text().split()[0]
            detail['currency'] = currency
            detail['currencySymbol'] = tool.get_unit(currency)

            # price
            price, listPrice = self.get_all_price(area)
            detail['price'] = price
            detail['listPrice'] = listPrice

            color, sizes = self.get_sizes(area)

            # color
            detail['color'] = color

            #sizes
            detail['sizes'] = sizes

            # off the shelf (sold out):
            if isinstance(detail['sizes'],
                          basestring) and detail['sizes'] == 'sold out':

                log_info = json.dumps(
                    dict(time=time.time(),
                         title=pqhtml('title').text(),
                         url=url))

                self.logger.info(log_info)

                data = tool.get_off_shelf(code=status_code,
                                          message=self.cfg.SOLD_OUT,
                                          backUrl=resp.url,
                                          html=pqhtml.outerHtml())

                return tool.return_data(successful=False, data=data)

            # description
            detail['descr'] = area('div#description').text() or pqhtml(
                '#product-details .product-details-section').text()

            # brand
            detail['brand'] = area('h1.brand').text()

            # product ID
            prodId = area.attr('data-id')
            detail['productId'] = prodId
            detail['colorId'] = prodId

            # HTTP status code
            detail['status_code'] = status_code

            # status
            detail['status'] = self.cfg.STATUS_SALE

            # back URL
            detail['backUrl'] = resp.url

            # returned IP and port
            if resp.raw._original_response.peer:
                detail['ip_port'] = ':'.join(
                    map(lambda x: str(x), resp.raw._original_response.peer))

            log_info = json.dumps(
                dict(time=time.time(),
                     productId=detail['productId'],
                     name=detail['name'],
                     currency=detail['currency'],
                     price=detail['price'],
                     listPrice=detail['listPrice'],
                     url=url))

            self.logger.info(log_info)

            return tool.return_data(successful=True, data=detail)

        except Exception, e:
            raise
Example #43
0
    def getTweets(tweetCriteria,
                  receiveBuffer=None,
                  bufferLength=100,
                  proxy=None):
        refreshCursor = ''

        results = []
        resultsAux = []
        cookieJar = cookielib.CookieJar()

        if hasattr(tweetCriteria, 'username') and (
                tweetCriteria.username.startswith("\'")
                or tweetCriteria.username.startswith("\"")) and (
                    tweetCriteria.username.endswith("\'")
                    or tweetCriteria.username.endswith("\"")):
            tweetCriteria.username = tweetCriteria.username[1:-1]

        active = True

        while active:
            json = TweetManager.getJsonReponse(tweetCriteria, refreshCursor,
                                               cookieJar, proxy)
            if len(json['items_html'].strip()) == 0:
                break

            refreshCursor = json['min_position']
            scrapedTweets = PyQuery(json['items_html'])
            #Remove incomplete tweets withheld by Twitter Guidelines
            scrapedTweets.remove('div.withheld-tweet')
            tweets = scrapedTweets('div.js-stream-tweet')

            if len(tweets) == 0:
                break

            for tweetHTML in tweets:
                tweetPQ = PyQuery(tweetHTML)
                tweet = models.Tweet()

                txt = re.sub(
                    r"\s+", " ",
                    tweetPQ("p.js-tweet-text").text().replace('# ',
                                                              '#').replace(
                                                                  '@ ', '@'))
                try:
                    dateSec = int(
                        tweetPQ("small.time span.js-short-timestamp").attr(
                            "data-time"))
                except:
                    dateSec = int('1575158200')

                tweet.text = txt
                tweet.date = datetime.datetime.fromtimestamp(dateSec)

                results.append(tweet)
                resultsAux.append(tweet)

                if receiveBuffer and len(resultsAux) >= bufferLength:
                    receiveBuffer(resultsAux)
                    resultsAux = []

                print len(results)
                if tweetCriteria.maxTweets > 0 and len(
                        results) >= tweetCriteria.maxTweets:
                    active = False
                    break

        if receiveBuffer and len(resultsAux) > 0:
            receiveBuffer(resultsAux)

        return results
Example #44
0
def simple_package(package_name):
    ''' Given a package name, returns all the versions for downloading
    that package.

    If the package doesn't exist, then it will call PyPi (CheeseShop).
    But if the package exists in the local path, then it will get all
    the versions for the local package.

    This will take into account if the egg is private or if it is a normal
    egg that was uploaded to PyPi. This is important to take into account
    the version of the eggs. For example, a project requires request==1.0.4
    and another package uses request==1.0.3. Then the installation of the
    second package will fail because it wasn't downloaded and the **request**
    folder only has the 1.0.4 version.

    To solve this problem, the system uses 2 different kinds of eggs:

    * private eggs: are the eggs that you uploaded to the private repo.
    * normal eggs: are the eggs that are downloaded from pypi.

    So the normal eggs will always get the simple page from the pypi repo,
    while the private eggs will always be read from the filesystem.


    :param package_name: the name of the egg package. This is only the
                          name of the package, without the version or anything
                          else.

    :return: a template with all the links to download the packages.
    '''
    app.logger.debug('Requesting index for: %s', package_name)
    package_folder = get_package_path(package_name)
    if (is_private(package_name) or (
            exists(package_name) and app.config['SHOULD_USE_EXISTING'])):

        app.logger.debug('Found information of package: %s in local repository',
                         package_name)
        package_versions = []
        template_data = dict(
            source_letter=package_name[0],
            package_name=package_name,
            versions=package_versions
        )

        for filename in listdir(package_folder):
            if not filename.endswith('.md5'):
                # I only read .md5 files so I skip this egg (or tar,
                # or zip) file
                continue

            with open(join(package_folder, filename)) as md5_file:
                md5 = md5_file.read(-1)

            # remove .md5 extension
            name = filename[:-4]
            data = VersionData(name, md5)
            package_versions.append(data)

        return render_template('simple_package.html', **template_data)
    else:
        app.logger.debug('Did not find package: %s in local repository. '
                         'Using proxy.', package_name)
        url = app.config['PYPI_URL'] + 'simple/%s' % package_name
        response = get(url)
        if response.status_code != 200:
            app.logger.warning('Error while getting proxy info for: %s. '
                               'Error details: %s', package_name,
                               response.text)
            abort(response.status_code)

        content = response.content
        p = PyQuery(content)
        external_links = set()
        for anchor in p("a"):
            panchor = PyQuery(anchor)
            href = panchor.attr('href')
            # robin-jarry: modified the href to ../../packages/
            # so that it works also for non-source packages (.egg, .exe and .msi)
            parsed = urlparse.urlparse(href)
            
            if parsed.hostname:
                # the link is to an external server.
                if parsed.hostname == 'pypi.python.org':
                    # we remove the hostname to make the URL relative
                    panchor.attr('href', parsed.path)
                else:
                    if panchor.attr('rel') == 'download':
                        if url_is_egg_file(parsed.path):
                            # href points to a filename
                            external_links.add('<a href="%s">%s</a>' % (href, basename(parsed.path)))
                        else:
                            # href points to an external page where we will find 
                            # links to package files
                            external_links.update(find_external_links(href))
                    # whatever happens, we remove the link for now;
                    # the external_links we collected will be added back afterwards
                    panchor.remove()                    
            else:
                # local link to pypi.python.org
                if not href.startswith('../../packages/'):
                    # ignore anything other than package links
                    panchor.remove()
            
        # after collecting all external links, we insert them in the html page
        for link in external_links:
            plink = PyQuery(link)
            href = plink.attr('href')
            plink.attr('href', convert_to_internal_url(href, package_name, basename(href)))
            p('a').after(plink)
        
        content = p.outerHtml()
        return content
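The proxy branch above is mostly PyQuery link surgery: hrefs pointing at pypi.python.org are made relative, rel="download" links are collected, and everything else is dropped. A toy version of that surgery on a fabricated simple-index snippet (for brevity it keeps rel="download" links in place instead of collecting and re-inserting them, and it uses the Python 3 urllib.parse import):

from urllib.parse import urlparse
from pyquery import PyQuery

content = ('<html><body>'
           '<a href="https://pypi.python.org/packages/source/f/foo/foo-1.0.tar.gz">foo-1.0</a>'
           '<a href="http://example.com/foo-2.0.tar.gz" rel="download">external file</a>'
           '<a href="http://example.com/foo/">homepage</a>'
           '</body></html>')

p = PyQuery(content)
for anchor in p("a"):
    panchor = PyQuery(anchor)
    parsed = urlparse(panchor.attr('href'))
    if parsed.hostname == 'pypi.python.org':
        panchor.attr('href', parsed.path)   # make the link relative to this proxy
    elif panchor.attr('rel') != 'download':
        panchor.remove()                    # neither pypi nor a download link: drop it

for anchor in p("a"):
    print(PyQuery(anchor).attr('href'))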
Example #45
0
    def detail(self, url):
        try:
            resp = self.session.get(url, timeout=self.cfg.REQUEST_TIME_OUT)
            # resp = requests.get(url,headers=self.session.headers,timeout=self.cfg.REQUEST_TIME_OUT)
            # print self.session.headers
            # resp = requests.get(url,headers=self.session.headers,timeout=20)

            status_code = resp.status_code
            pqhtml = PyQuery(resp.text or 'nothing')

            # print resp.headers

            # off the shelf
            if status_code == 404:

                log_info = json.dumps(
                    dict(time=time.time(),
                         title=pqhtml('title').text(),
                         url=url))

                self.logger.info(log_info)

                data = tool.get_off_shelf(code=status_code,
                                          message=self.cfg.SOLD_OUT,
                                          backUrl=resp.url,
                                          html=pqhtml.outerHtml())

                return tool.return_data(successful=False, data=data)

            if status_code != 200:

                log_info = json.dumps(
                    dict(time=time.time(),
                         title=pqhtml('title').text(),
                         url=url))

                self.logger.info(log_info)

                data = tool.get_error(code=status_code,
                                      message=self.cfg.GET_ERR.get(
                                          'SCERR', 'ERROR'),
                                      backUrl=resp.url,
                                      html=pqhtml.outerHtml())

                return tool.return_data(successful=False, data=data)

            Jtxt = pqhtml('script').text()

            # off the shelf
            if 'productDetails' not in Jtxt:

                log_info = json.dumps(
                    dict(time=time.time(),
                         title=pqhtml('title').text(),
                         url=url))

                self.logger.info(log_info)

                data = tool.get_off_shelf(code=status_code,
                                          message=self.cfg.SOLD_OUT,
                                          backUrl=resp.url,
                                          html=pqhtml.outerHtml())
                return tool.return_data(successful=False, data=data)

            pdata = self.get_pdata(Jtxt)

            # preliminary lookups
            product = pdata['product']
            allLooks = product['allLooks']
            skuJournal = self.get_skuJournal(Jtxt)
            sizeAttribute = product['sizeAttribute'] if product.has_key(
                'sizeAttribute') else {
                    'values': [{
                        'id': 0,
                        'value': self.cfg.DEFAULT_ONE_SIZE
                    }]
                }
            colorAttribute = product['colorAttribute'] if product.has_key(
                'colorAttribute') else {
                    'values': [{
                        'id': 0,
                        'value': self.cfg.DEFAULT_ONE_COLOR
                    }]
                }

            # lookId to SkuArr mapping
            # lookId2SkuArr = dict([(look['productLookId'],[Id['skuId'] for Id in look['skus']]) for look in allLooks])
            # lookId to ImgArr mapping
            lookId2ImgArr = dict([(look['productLookId'], [
                'http:' + img['retinaQuickViewLookUrl']
                for img in look['images']
            ]) for look in allLooks])
            # lookId to sale price mapping; one price per color
            lookId2Price = dict([(look['productLookId'],
                                  look['pricing']['maxSkuSalePrice']['raw'])
                                 for look in allLooks])
            # lookId to list price mapping; one price per color
            lookId2ListPrice = dict([
                (look['productLookId'],
                 look['pricing']['maxSkuMsrpPrice']['raw'])
                for look in allLooks
            ])
            # lookId to skuArr mapping
            lookId2SkuArr = dict([(look['productLookId'],
                                   [Id['skuId'] for Id in look['skus']])
                                  for look in allLooks])
            # sizeId to name mapping, e.g. {2000: u's', 2001: u'm', 2002: u'l', 2003: u'xl', 2004: u'xxl'}
            sizeId2Name = dict([(size['id'], size['value'])
                                for size in sizeAttribute['values']])
            # colorId to name mapping, e.g. {1000: u'dark red', 1001: u'true navy'}
            colorId2Name = dict([(color['id'], color['value'])
                                 for color in colorAttribute['values']])
            # sku to in-stock mapping
            sku2Inventory = self.get_sku2Inventory(skuJournal)
            # sku to out-of-stock mapping
            sku2NoInventory = dict([
                (sku['skuId'], sku['numberUnitsForSale'])
                for sku in skuJournal['entries']
                if sku['type'] == 'inventory' and sku['status'] == ['X', 'U']
            ])
            # merge the two inventory mappings
            sku2Inventory.update(sku2NoInventory)
            # sku to sale price mapping; one price per size.
            sku2Price = dict([(sku['skuId'], str(sku['salePrice']['raw']))
                              for sku in skuJournal['entries']
                              if sku['type'] == 'pricing'])
            # sku to list price mapping; one price per size.
            sku2ListPrice = dict([(sku['skuId'], str(sku['msrpPrice']['raw']))
                                  for sku in skuJournal['entries']
                                  if sku['type'] == 'pricing'])
            # skuId to sizeId mapping
            skuId2SizeId = dict([
                (sku['skuId'], sku['savId']) for sku in skuJournal['entries']
                if sku['type'] == 'associate' and sku['attribute'] == 'Size'
            ])
            # skuId to colorId mapping
            skuId2ColorId = dict([
                (sku['skuId'], sku['savId']) for sku in skuJournal['entries']
                if sku['type'] == 'associate' and sku['attribute'] == 'Color'
            ])
            # sku to sizeName mapping
            sku2SizeName = self.get_sku2SizeName(product, skuId2SizeId,
                                                 sizeId2Name)
            # sku to colorName mapping
            sku2ColorName = self.get_sku2ColorName(product, skuId2ColorId,
                                                   colorId2Name)
            # lookId to colorId mapping
            lookId2ColorId = self.get_lookIe2ColorId(lookId2SkuArr,
                                                     skuId2ColorId)
            # lookId to colorName mapping
            lookId2ColorName = self.get_lookIe2ColorName(
                lookId2SkuArr, sku2ColorName)
            # lookId to available sizes mapping
            lookId2Sizes = self.get_lookId2Sizes(lookId2SkuArr, sku2SizeName,
                                                 sku2Inventory, sku2Price,
                                                 sku2ListPrice)

            # print(json.dumps(sku2Price))
            # print(json.dumps(sku2ListPrice))
            # print(json.dumps(lookId2SkuArr))
            # print(json.dumps(sku2ColorName))
            # print(json.dumps(lookId2ColorName))
            # print(json.dumps(sku2SizeName))
            detail = dict()

            # only take the sku for the current URL
            try:
                lookId = None
                if '-' in url[url.rindex('/'):]:
                    lookId = url[url.rindex('/') + 1:].split('-')[0]
                    lookIds = [int(lookId)]
            except Exception, e:
                pass

            # keys
            detail['keys'] = lookId2SkuArr.keys()

            # only use the lookId from the URL
            # detail['keys'] = lookIds or lookId2SkuArr.keys()

            # color
            detail['color'] = lookId2ColorName
            detail['colorId'] = lookId2ColorId

            # product ID
            detail['productId'] = product['productId']

            # images
            detail['img'] = dict([(lookId, imgArr[0])
                                  for lookId, imgArr in lookId2ImgArr.items()])
            detail['imgs'] = lookId2ImgArr

            # sizes
            detail['sizes'] = lookId2Sizes

            # price
            detail['price'] = lookId2Price
            detail['listPrice'] = lookId2ListPrice

            # brand
            brand = pdata['brand']['name']
            detail['brand'] = brand

            # name
            detail['name'] = brand + ' ' + pdata['product']['name']

            # currency symbol
            currency = pdata['defaultLook']['pricing']['currencyCode']
            detail['currency'] = currency
            detail['currencySymbol'] = tool.get_unit(currency)

            # returns policy
            detail['returns'] = pdata['returnPolicy']['description']

            # description
            dtxt = PyQuery(pdata['product']['description'])
            dtxt.remove('strong')
            detail['descr'] = dtxt.text()

            # HTTP status code
            detail['status_code'] = status_code

            # status
            detail['status'] = self.cfg.STATUS_SALE

            # back URL
            detail['backUrl'] = resp.url

            log_info = json.dumps(
                dict(time=time.time(),
                     productId=detail['productId'],
                     name=detail['name'],
                     currency=detail['currency'],
                     price=detail['price'],
                     listPrice=detail['listPrice'],
                     url=url))

            self.logger.info(log_info)

            return tool.return_data(successful=True, data=detail)
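Most of the bookkeeping above just builds small lookup dicts from the skuJournal entries. A toy journal (all values invented) shows the shape of two of those mappings:

# fabricated entries, mimicking the structure the code above reads
skuJournal = {'entries': [
    {'type': 'pricing', 'skuId': 101, 'salePrice': {'raw': 49.0}, 'msrpPrice': {'raw': 69.0}},
    {'type': 'pricing', 'skuId': 102, 'salePrice': {'raw': 52.0}, 'msrpPrice': {'raw': 69.0}},
    {'type': 'associate', 'skuId': 101, 'attribute': 'Size', 'savId': 2000},
    {'type': 'associate', 'skuId': 102, 'attribute': 'Size', 'savId': 2001},
]}

sku2Price = dict([(sku['skuId'], str(sku['salePrice']['raw']))
                  for sku in skuJournal['entries'] if sku['type'] == 'pricing'])
skuId2SizeId = dict([(sku['skuId'], sku['savId'])
                     for sku in skuJournal['entries']
                     if sku['type'] == 'associate' and sku['attribute'] == 'Size'])

print(sku2Price)      # {101: '49.0', 102: '52.0'}
print(skuId2SizeId)   # {101: 2000, 102: 2001}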
Example #46
0
    def getTweets(tweetCriteria,
                  receiveBuffer=None,
                  bufferLength=100,
                  proxy=None):
        refreshCursor = ""

        # results = [] # ORIGINAL CODE LINE

        # MY MODIFICATION START
        results = {}
        # MY MODIFICATION END

        resultsAux = []
        cookieJar = http.cookiejar.CookieJar()

        active = True

        while active:
            json = TweetManager.getJsonReponse(tweetCriteria, refreshCursor,
                                               cookieJar, proxy)
            if len(json["items_html"].strip()) == 0:
                break

            refreshCursor = json["min_position"]
            scrapedTweets = PyQuery(json["items_html"])
            # Remove incomplete tweets withheld by Twitter Guidelines
            scrapedTweets.remove("div.withheld-tweet")
            tweets = scrapedTweets("div.js-stream-tweet")

            if len(tweets) == 0:
                break

            for tweetHTML in tweets:
                tweetPQ = PyQuery(tweetHTML)
                tweet = models.Tweet()

                # usernameTweet = tweetPQ("span.username.js-action-profile-name b").text()
                usernameTweet = tweetPQ("span:first.username.u-dir b").text()
                txt = re.sub(
                    r"\s+",
                    " ",
                    tweetPQ("p.js-tweet-text").text().replace("# ",
                                                              "#").replace(
                                                                  "@ ", "@"),
                )

                # NEW CODE START
                txt = re.sub(r"\s+", " ", tweetPQ("p.js-tweet-text").text())
                txt = re.sub(r"#\s*", "#", txt)
                txt = re.sub(r"@\s*", "@", txt)
                # NEW CODE END

                retweets = int(
                    tweetPQ(
                        "span.ProfileTweet-action--retweet span.ProfileTweet-actionCount"
                    ).attr("data-tweet-stat-count").replace(",", ""))
                favorites = int(
                    tweetPQ(
                        "span.ProfileTweet-action--favorite span.ProfileTweet-actionCount"
                    ).attr("data-tweet-stat-count").replace(",", ""))
                dateSec = int(
                    tweetPQ("small.time span.js-short-timestamp").attr(
                        "data-time"))
                id = tweetPQ.attr("data-tweet-id")
                permalink = tweetPQ.attr("data-permalink-path")
                user_id = int(
                    tweetPQ("a.js-user-profile-link").attr("data-user-id"))

                geo = ""
                geoSpan = tweetPQ("span.Tweet-geo")
                if len(geoSpan) > 0:
                    geo = geoSpan.attr("title")
                urls = []
                for link in tweetPQ("a"):
                    try:
                        urls.append((link.attrib["data-expanded-url"]))
                    except KeyError:
                        pass
                tweet.id = id
                tweet.permalink = "https://twitter.com" + permalink
                tweet.username = usernameTweet

                tweet.text = txt
                tweet.date = datetime.datetime.fromtimestamp(dateSec)
                tweet.formatted_date = datetime.datetime.fromtimestamp(
                    dateSec).strftime("%a %b %d %X +0000 %Y")
                tweet.retweets = retweets
                tweet.favorites = favorites
                # tweet.mentions = " ".join(re.compile('(@\\w*)').findall(tweet.text)) #OLD
                tweet.mentions = " ".join(
                    re.compile(r"(@\s\w*)").findall(tweet.text))  # NEW
                # tweet.hashtags = " ".join(re.compile('(#\\w*)').findall(tweet.text)) # OLD
                tweet.hashtags = " ".join(
                    re.compile(r"(#\s\w*)").findall(tweet.text))  # NEW
                tweet.geo = geo
                tweet.urls = ",".join(urls)
                tweet.author_id = user_id

                # tweet.replies = int(tweetPQ("span.ProfileTweet-action--reply span.ProfileTweet-actionCount").attr("data-tweet-stat-count").replace(",", "").strip())
                replies = int(
                    tweetPQ(
                        "span.ProfileTweet-action--reply span.ProfileTweet-actionCount"
                    ).attr("data-tweet-stat-count").replace(",", ""))
                tweet.reply = replies

                # NEW CODE START
                try:
                    tweet.isReply = tweetPQ(
                        "div.ReplyingToContextBelowAuthor").is_("div")
                    if tweet.isReply:
                        tweet.replyTo = tweetPQ(
                            "div.ReplyingToContextBelowAuthor span.username b"
                        ).contents()[0]
                    else:
                        tweet.replyTo = ""
                except:
                    pass
                # NEW CODE END

                # results.append(tweet) # ORIGINAL CODE LINE

                results[id] = [
                    tweet.geo,
                    tweet.author_id,
                    tweet.date,
                    tweet.text,
                    tweet.retweets,
                    tweet.favorites,
                    tweet.mentions,
                    tweet.hashtags,
                    tweet.permalink,
                    tweet.reply,
                    tweet.isReply,
                    tweet.replyTo,
                ]

                resultsAux.append(tweet)

                if receiveBuffer and len(resultsAux) >= bufferLength:
                    receiveBuffer(resultsAux)
                    resultsAux = []

                if (tweetCriteria.maxTweets > 0
                        and len(results) >= tweetCriteria.maxTweets):
                    active = False
                    break

        if receiveBuffer and len(resultsAux) > 0:
            receiveBuffer(resultsAux)

        return results
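Because this variant returns a dict keyed by tweet id rather than a list, downstream code has to unpack the per-id value lists. A minimal sketch of turning that dict into row dicts, with the field order taken from the assignment above:

columns = ['geo', 'author_id', 'date', 'text', 'retweets', 'favorites',
           'mentions', 'hashtags', 'permalink', 'reply', 'isReply', 'replyTo']

def rows_from_results(results):
    # results is the {tweet_id: [field values]} dict returned above
    for tweet_id, values in results.items():
        yield dict(zip(['id'] + columns, [tweet_id] + values))

# for row in rows_from_results(results):
#     print(row['id'], row['date'], row['text'])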
Example #47
0
    def extract(self):
        item = ContentItem()

        self.html = re.sub('<!--.*?-->', '', self.html)
        content_node = self.hxs.select("//div[@class = 'art_con']").extract()
        content_node = PyQuery(content_node[0])
        
        content_node.remove('div[class = "pconline_page"]')
        content_node.remove('div[class = "pc3g"]')
        content_node.remove('div[class = "pageTips"]')
        content_node.remove('div[class = "art_nav_box mt10"]')
        content_node.remove('div[class = "art_bottom"]')
        content_node.remove('div[class = "art_con_top"]')

        

        item['image_urls'] = [self.getRealURI(img.get('src')) for img in content_node('img') if img.get('src') and not img.get('src').endswith('.gif')]
        item['title'] = self.title = self.hxs.select("//h1/text()").extract()[0]
        if not item['title']:
            item['title'] = self.title = self.hxs.select("//div[@id = 'UC_newsInfoDetail_lbl_newsTitle']/text()").extract()[0]
        item['content'] = self.content = content_node.__unicode__()
        release_time = self.hxs.select("//div[@class = 'art_con_top']").extract()[0]
        doc_t = PyQuery(release_time)
        release_time = doc_t('span').text()
        p = re.compile(u'20\d\d年\d\d月\d\d日')
        #item['release_time'] = self.release_time = doc('div[class="art_con_top"]').find('span').eq(0).text()
        item['release_time'] = self.release_time = p.search(release_time).group()
        item['source'] = u'pconline'
        item['author'] = ''
        item['pic_url'] = ''

        return item
Example #48
0
class HTMLGenerator(object):
    """HTML Generator
    """

    def __init__(self):
        self.MAX_WORKERS = 4
        self.MULTIPROCESS_BOUND = 20

    def load_tree_template(self):
        """Load tree HTML templates
        """
        with open(os.path.join(os.path.dirname(__file__), 'template', 'tree_template.html')) as f:
            self.template = PyQuery(f.read(), parser='html')
        with open(os.path.join(os.path.dirname(__file__), 'template', 'tree_node_template.html')) as f:
            self.node_template = PyQuery(f.read(), parser='html')
            self.node_template_html = self.node_template.html()

    def import_js(self, js_ids):
        """Import JS to HTML
        :param js_ids: dict type, {script_id with #: js_file_name}
                        exmaple: {"#script_jquery": "jquery.min.js"}
        """
        _path = os.path.dirname(__file__)

        for _id in js_ids.iterkeys():
            self.template(_id).attr("src", "%s/bin/js/%s" % (_path, js_ids[_id]))
            # In case lxml changes <script></script> to <script/>
            self.template(_id).html("var _lxml = 0;")

    def generate_tree_structure_HTML(self, root_node, output):
        """Generate a html file with tree structure.
        :param root_node: RDirNode root of the module
        :param output: Output html file
        """

        # Init
        self.load_tree_template()
        self.tree_nodes = []
        self.max_layer = 0

        self.import_js({
            # script_id : js_file_name
            "#script_jquery": "jquery.min.js",
            "#script_rdir_tree": "rdir_tree.js"
        })
        self.template('#header_name').html(root_node.name)
        self.template('#header_type').html(" &lt;%s&gt;" % root_node.type)

        header_doc = root_node.doc.replace('\t', '&nbsp;' * 4) \
            .replace(' ', '&nbsp;').replace('\n', '<br/>').strip()
        if len(header_doc) > 0:
            self.template('#header_doc').html(header_doc + '<br/>')
        else:
            self.template.remove('#header_doc')
        self.template('title').html(root_node.name)

        # Recur
        if len(root_node.list_children()) == 0:
            # self._add_node_to_HTML("No visible children methods or members.",
            #                        "If you see this, that means this object has nothing else to show.",
            #                        "404",
            #                        0)
            pass
        else:
            self.render_tree_html(root_node)


        # Render html
        for i in xrange(self.max_layer + 1):
            self.template("#choose_layer").append(
                "<option value='%d'>%d</option>" % (i, i)
            )

        self.template('#wrapper').append("\n".join(self.tree_nodes))

        # Write to file
        with open(output, 'w') as f:
            f.write(self.template.html())


    def render_tree_html(self, root_node):
        """ Render the node html. Use multiprocessing to speed up if needed.
        :param root_node: RDirNode root of the module
        """
        job_list = self.get_job_list(root_node)
        job_size = len(job_list)

        if job_size > self.MULTIPROCESS_BOUND:
            jobs_list = Util.split_jobs(job_list, self.MAX_WORKERS)
        else:
            jobs_list = [job_list]
        pool = multiprocessing.Pool(processes=self.MAX_WORKERS)

        result = []
        html = self.node_template.html()
        for jobs in jobs_list:
            if len(jobs) > 0:
                result.append(pool.apply_async(parse_tree_node_worker, (html, jobs)))

        # Close the pool so no further tasks can be submitted, and wait for
        # the workers to finish before collecting their results below.
        pool.close()
        pool.join()

        self.tree_nodes = [None] * job_size
        for res in result:
            res = res.get()
            for tpl in res:
                index, node_html = tpl
                self.tree_nodes[index] = node_html


    def get_job_list(self, root_node):
        """Generate the job list
        :param root_node: RdirNode type, root of rdir_node
        :return: list type, [(index, rdir_node, depth)]
        """
        job_list = []
        for key in root_node.list_children():
            job_list += self.recur_node_to_list(root_node.get_children(key), 0)
        return [(index, job[0], job[1]) for index, job in enumerate(job_list)]

    def recur_node_to_list(self, rdir_node, depth):
        """Recursively traverse all the nodes into a sequential list.
        :param rdir_node:
        :param depth:
        :return: list type, [(rdir_node, depth)]
        """
        self.max_layer = max(self.max_layer, depth)
        _list = [(rdir_node, depth)]
        for key in rdir_node.list_children():
            _list += self.recur_node_to_list(rdir_node.get_children(key), depth + 1)
        return _list
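
A minimal usage sketch for HTMLGenerator above; root_node stands in for the RDirNode object the class expects (providing name, type, doc, list_children() and get_children()), and the template and JS files must exist where load_tree_template() and import_js() look for them. build_rdir_tree is purely hypothetical:

# Hypothetical usage - build_rdir_tree() and the RDirNode it returns are
# assumed to come from the surrounding project; they are not defined here.
generator = HTMLGenerator()
root_node = build_rdir_tree('some_module')
generator.generate_tree_structure_HTML(root_node, 'some_module_tree.html')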
예제 #49
0
import json
import os
import sys
from subprocess import call

# Assumption: Template is jinja2's; any template class whose
# render(**kwargs) fills named placeholders would work here.
from jinja2 import Template
from pyquery import PyQuery

assert len(sys.argv) == 2, "Second argument is the notebook name!"
NOTEBOOK = sys.argv[1]

parts = NOTEBOOK.split('.')
parts[-1] = "html"
HTML_FILE = ".".join(parts)

# Gather the information from the first cell.
with open(NOTEBOOK) as f:
    res = json.load(f)
blocks = json.loads("".join(res['cells'][0]['source']))

# Convert the notebook. 
call(['ipython', 'nbconvert', NOTEBOOK, '--to', 'html', '--template', 'basic'])

# Remove input cells.
with open(HTML_FILE) as f:
    doc = PyQuery(f.read(), parser='html')
    doc.remove('.input')
    blocks['body'] = doc.html()

# Insert into simple template. 
BASE_DIR = os.path.dirname(os.path.realpath(__file__))
with open(os.path.join(BASE_DIR, 'my_template.html')) as f:
    tmpl = f.read()
template = Template(tmpl)

with open(HTML_FILE, 'w') as f:
    f.write(template.render(**blocks))
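
A possible invocation sketch for the script above, assuming it is saved as strip_inputs.py (a hypothetical name) and that ipython/nbconvert and the template engine are installed; the notebook's first cell must contain a JSON object whose keys match the placeholders in my_template.html (the script supplies 'body' itself):

# Hypothetical: equivalent to running "python strip_inputs.py analysis.ipynb"
# from a shell; writes analysis.html with the input cells stripped.
from subprocess import call
call(['python', 'strip_inputs.py', 'analysis.ipynb'])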
예제 #50
0
    def getTweets(tweetCriteria,
                  receiveBuffer=None,
                  bufferLength=100,
                  proxy=None):
        refreshCursor = ''

        results = []
        resultsAux = []
        cookieJar = http.cookiejar.CookieJar()

        active = True

        while active:
            json = TweetManager.getJsonReponse(tweetCriteria, refreshCursor,
                                               cookieJar, proxy)
            if len(json['items_html'].strip()) == 0:
                break

            refreshCursor = json['min_position']
            scrapedTweets = PyQuery(json['items_html'])
            # Remove incomplete tweets withheld by Twitter guidelines
            scrapedTweets.remove('div.withheld-tweet')
            tweets = scrapedTweets('div.js-stream-tweet')

            if len(tweets) == 0:
                break

            for tweetHTML in tweets:
                tweetPQ = PyQuery(tweetHTML)
                tweet = models.Tweet()
                usernameTweet = tweetPQ(
                    "span.username.js-action-profile-name b").text()
                raw_txt = re.sub(r"\s+", " ",
                                 tweetPQ("p.js-tweet-text").text())
                txt = raw_txt.replace('#', '# ').replace('@', '@ ')
                retweets = int(
                    tweetPQ(
                        "span.ProfileTweet-action--retweet span.ProfileTweet-actionCount"
                    ).attr("data-tweet-stat-count").replace(",", ""))
                favorites = int(
                    tweetPQ(
                        "span.ProfileTweet-action--favorite span.ProfileTweet-actionCount"
                    ).attr("data-tweet-stat-count").replace(",", ""))
                dateSec = int(
                    tweetPQ("small.time span.js-short-timestamp").attr(
                        "data-time"))
                id = tweetPQ.attr("data-tweet-id")
                permalink = tweetPQ.attr("data-permalink-path")
                user_id = int(
                    tweetPQ("a.js-user-profile-link").attr("data-user-id"))

                geo = ''
                geoSpan = tweetPQ('span.Tweet-geo')
                if len(geoSpan) > 0:
                    geo = geoSpan.attr('title')
                urls = []
                for link in tweetPQ("a"):
                    try:
                        urls.append((link.attrib["data-expanded-url"]))
                    except KeyError:
                        pass
                tweet.id = id
                tweet.permalink = 'https://twitter.com' + permalink
                tweet.username = usernameTweet

                tweet.raw_txt = raw_txt
                tweet.text = txt
                tweet.date = datetime.datetime.fromtimestamp(dateSec)
                tweet.formatted_date = datetime.datetime.fromtimestamp(
                    dateSec).strftime("%a %b %d %X +0000 %Y")
                tweet.retweets = retweets
                tweet.favorites = favorites
                tweet.mentions = " ".join(
                    re.compile('@ \\S+').findall(raw_txt))
                tweet.hashtags = " ".join(
                    re.compile('# \\S+').findall(raw_txt))
                tweet.geo = geo
                tweet.urls = ",".join(urls)
                tweet.author_id = user_id

                results.append(tweet)
                resultsAux.append(tweet)

                if receiveBuffer and len(resultsAux) >= bufferLength:
                    receiveBuffer(resultsAux)
                    resultsAux = []

                if tweetCriteria.maxTweets > 0 and len(
                        results) >= tweetCriteria.maxTweets:
                    active = False
                    break

        if receiveBuffer and len(resultsAux) > 0:
            receiveBuffer(resultsAux)

        return results
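
A hedged usage sketch based only on what getTweets() above consumes: tweetCriteria must at least expose maxTweets (plus whatever getJsonReponse() reads from it, which is not shown here), and receiveBuffer is an optional callback that receives lists of up to bufferLength tweets. TweetCriteria here is hypothetical:

# Hypothetical criteria object - the real TweetCriteria class lives
# elsewhere in the project; only maxTweets is visible in this snippet.
criteria = TweetCriteria()
criteria.maxTweets = 50

def on_batch(tweets):
    # Called with each batch of up to bufferLength tweets.
    print("received %d tweets" % len(tweets))

all_tweets = TweetManager.getTweets(criteria, receiveBuffer=on_batch, bufferLength=25)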
예제 #51
0
import os
from pyquery import PyQuery

in_dir = '/home/jental/dev/eda_ru/eda.ru/'
# in_dir = '/home/jental/dev/eda_ru/tmp/'

for cdir, dirs, files in os.walk(in_dir):
  for file in files:
    full_filename_b = os.path.join(cdir, file).encode("utf-8", "surrogateescape")
    try:
      full_filename = full_filename_b.decode("utf-8")
      print(full_filename)
      if full_filename.endswith(('.html', '.htm')):
        with open(full_filename, 'r') as fh:
          html = fh.read()

          # Parse the page and strip ad links in place.
          jQuery = PyQuery(html)
          jQuery.remove('.ad-link')
        with open(full_filename, 'w') as fh:
          # jQuery("html").html() returns only the inner markup of <html>,
          # so the <html> tag itself and any doctype are not written back.
          fh.write(jQuery("html").html())
    except Exception:
      # Skip files that cannot be decoded or parsed.
      pass
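
Note that jQuery("html").html() writes back only the inner markup of <html>. A hedged variant that keeps the root element by serializing the whole parsed tree (str() on a PyQuery object returns the outer HTML of its elements; a doctype, if present, is still dropped):

# Variant sketch: write the full serialized tree instead of only the
# inner markup of <html>.
with open(full_filename, 'w') as fh:
    fh.write(str(jQuery))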