Example #1
def get_data_list(URL, file_type=""):
    SUMMARY_SENTENCES_COUNT = 5
    sentences = []
    try:
        LANGUAGE = "english"
        # parser = None
        if file_type == "txt":
            parser = HtmlParser.from_string(URL, None, Tokenizer(LANGUAGE))
        elif file_type == "pdf":
            content = read_pdf(URL)
            parser = HtmlParser.from_string(content, None, Tokenizer(LANGUAGE))
        else:
            parser = HtmlParser.from_url(URL, Tokenizer(LANGUAGE))

        document = parser.document
        stemmer = Stemmer(LANGUAGE)

        from sumy.summarizers.luhn import LuhnSummarizer

        LHS = LuhnSummarizer(stemmer)
        LHS.stop_words = get_stop_words(LANGUAGE)
        print("\nSummary using Luhn Summarizer")
        print("*******************************")
        for sentence in LHS(document, SUMMARY_SENTENCES_COUNT):
            sentences.append(str(sentence))
    except Exception as e:
        print(str(e))
    finally:
        return sentences
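
The helper above dispatches on file_type, so (assuming the sumy imports and the project's read_pdf helper are in scope) it can take raw text, a PDF path, or a URL. A minimal usage sketch with placeholder inputs:

# Usage sketch for get_data_list (hypothetical inputs; read_pdf is assumed to be a project helper).
raw_text = "Lorem ipsum dolor sit amet. " * 20
summary_from_text = get_data_list(raw_text, file_type="txt")
summary_from_pdf = get_data_list("reports/example.pdf", file_type="pdf")  # path handed to read_pdf
summary_from_url = get_data_list("https://en.wikipedia.org/wiki/Automatic_summarization")
print("\n".join(summary_from_url))
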
Example #2
def get_doc_summary(html, url):
    '''
    Parse the document text and extract a summary with a summarization
    algorithm. This is helpful when the meta-description tag is not available.
    '''
    from sumy.parsers.html import HtmlParser
    # from sumy.parsers.plaintext import PlaintextParser
    from sumy.nlp.tokenizers import Tokenizer
    from sumy.summarizers.text_rank import TextRankSummarizer as Summarizer
    from sumy.nlp.stemmers import Stemmer
    from sumy.utils import get_stop_words

    LANGUAGE = "english"
    SENTENCES_COUNT = 3

    parser = HtmlParser.from_string(html, url, Tokenizer(LANGUAGE))
    # or for plain text files
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    res = ""
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        res += str(sentence)
    return res
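
Since the function takes raw HTML plus the page URL, a caller typically fetches the page first. A minimal sketch, assuming the requests library is available and a placeholder URL:

import requests

url = "https://en.wikipedia.org/wiki/Automatic_summarization"  # placeholder URL
html = requests.get(url, timeout=10).text
print(get_doc_summary(html, url))
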
Example #3
def auto_summarize_comment(request):
    
    comment_ids = request.POST.getlist('d_ids[]')
    
    sent_list = []
    
    for comment_id in comment_ids:
        comment = Comment.objects.get(id=comment_id)
        text = comment.text
        
        text = re.sub('<br>', ' ', text)
        text = re.sub('<BR>', ' ', text)
        
        parser = HtmlParser.from_string(text, '', Tokenizer("english"))
        
        num_sents = request.GET.get('num_sents', None)
        if not num_sents:
            all_sents = parser.tokenize_sentences(text)
            num_sents = floor(float(len(all_sents))/3.0)
        
        sents = summarizer(parser.document, num_sents)
         
        
        for sent in sents:
            sent_list.append(sent._text)
     
    return JsonResponse({"sents": sent_list})
Example #4
 def get_summary(self, summary_length: int = 10) -> Iterator[str]:
     # HtmlParser.from_string expects (string, url, tokenizer)
     parser = HtmlParser.from_string(self.content, self.link, Tokenizer(LANGUAGE))
     stemmer = Stemmer(LANGUAGE)
     summarizer = Summarizer(stemmer)
     summarizer.stop_words = get_stop_words(LANGUAGE)
     for sentence in summarizer(parser.document, summary_length):
         yield sentence
Example #5
def __init__(self, text):
    # assumed: this method belongs to a class and receives the text to summarize as 'text'
    LANGUAGE = "english"
    SENTENCES_COUNT = 1


    stemmer = Stemmer(LANGUAGE)

    lsaSummarizer = Lsa(stemmer)
    lsaSummarizer.stop_words = get_stop_words(LANGUAGE)
    luhnSummarizer = Luhn(stemmer)
    luhnSummarizer.stop_words = get_stop_words(LANGUAGE)
    # edmundsonSummarizer.bonus_words = get_bonus_words

    lexrankSummarizer = LexRank(stemmer)
    lexrankSummarizer.stop_words = get_stop_words(LANGUAGE)

    textrankSummarizer = TxtRank(stemmer)
    textrankSummarizer.stop_words = get_stop_words(LANGUAGE)

    sumbasicSummarizer = SumBasic(stemmer)
    sumbasicSummarizer.stop_words = get_stop_words(LANGUAGE)


    klSummarizer = KL(stemmer)
    klSummarizer.stop_words = get_stop_words(LANGUAGE)

    parser = HtmlParser.from_string(text, 0, Tokenizer(LANGUAGE))

    allvariations = []

    for sentence in lsaSummarizer(parser.document, SENTENCES_COUNT):
        # print("Summarizing text via LSA: ")
        print(str(sentence))
        allvariations.append(sentence)
    for sentence in luhnSummarizer(parser.document, SENTENCES_COUNT):
        #print("Summarizing text via Luhn: ")
        print(str(sentence))
        allvariations.append(sentence)
    for sentence in lexrankSummarizer(parser.document, SENTENCES_COUNT):
        #print("Summarizing text via Lexrank: ")
        print(str(sentence))
        allvariations.append(sentence)
    for sentence in textrankSummarizer(parser.document, SENTENCES_COUNT):
        #print("Summarizing text via Textrank: ")
        print(str(sentence))
        allvariations.append(sentence)
    for sentence in sumbasicSummarizer(parser.document, SENTENCES_COUNT):
        #print("Summarizing text via Sumbasic: ")
        print(str(sentence))
        allvariations.append(sentence)
    for sentence in klSummarizer(parser.document, SENTENCES_COUNT):
        #print("Summarizing text via klSum: ")
        print(str(sentence))
        allvariations.append(sentence)
    return allvariations
Example #6
def get_summary(html):
    parser = HtmlParser.from_string(html, tokenizer=Tokenizer(LANGUAGE), url=None)
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
Example #7
def summarize(doc, SENTENCES_COUNT):
    parser = HtmlParser.from_string(doc, None, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    summary = ""
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        if str(sentence).strip().startswith("Image copyright") is False:
            summary += (" " + str(sentence))
    return summary
Example #8
 def summarize(self, summarizer_type, max_sentences):
     # TextRank
     if summarizer_type == "textrank":
         self.result_list = summarize(self.document, ratio=0.3, word_count=None, split=True)
     # LSA (sumy)
     elif summarizer_type == "lsa":
         parser = HtmlParser.from_string(self.document, None, tokenizer=Tokenizer("english"))
         stemmer = Stemmer("english")
         summarizer = summarizers.lsa.LsaSummarizer(stemmer)
         summarizer.stop_words = get_stop_words("english")
         summarized_sentence_list = summarizer(parser.document, max_sentences)
         self.result_list = [str(sentence) for sentence in summarized_sentence_list]
Example #9
def summarize(method, length, url):
    html_content = fetch_url(url)
    iso_lang = detect_language(html_content)
    language = SUMY_LANGUAGES[iso_lang]
    stemmer = Stemmer(language)
    parser = HtmlParser.from_string(html_content, url, Tokenizer(language))

    summarizer_class = AVAILABLE_METHODS[method]
    summarizer = build_summarizer(summarizer_class, get_stop_words(language), stemmer, parser)

    sentences = summarizer(parser.document, ItemsCount(length))
    summary = ' '.join([unicode(sentence) for sentence in sentences])
    return summary, iso_lang
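
SUMY_LANGUAGES, AVAILABLE_METHODS and build_summarizer (as well as fetch_url and detect_language) are not shown in this snippet. A minimal sketch of what the first three might look like, using summarizer classes that ship with sumy; the exact mappings in the original project may differ:

from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer

# ISO 639-1 code -> sumy language name (assumed mapping)
SUMY_LANGUAGES = {"en": "english", "de": "german", "fr": "french", "cs": "czech"}

# method name -> summarizer class (assumed mapping)
AVAILABLE_METHODS = {
    "lsa": LsaSummarizer,
    "luhn": LuhnSummarizer,
    "lex-rank": LexRankSummarizer,
    "text-rank": TextRankSummarizer,
}

def build_summarizer(summarizer_class, stop_words, stemmer, parser):
    # 'parser' is accepted for interface compatibility but not needed to build the summarizer
    summarizer = summarizer_class(stemmer)
    summarizer.stop_words = stop_words
    return summarizer
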
Example #10
def do():
    rows = store.get_row_by_status(1)

    for row in rows:
        parser = HtmlParser.from_string(row["content_origin"], row["url"], Tokenizer(LANGUAGE))
        stemmer = Stemmer(LANGUAGE)
        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)

        sentences = list()

        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            sentences.append(str(sentence))

        summary = "\n".join(sentences)

        store.update_row(row["id"], {"summary_origin": summary, "status": 2})
Example #11
def get_summary(xhtml):
    summary_algorithm = TextSummarizer
    LANGUAGE = "english"
    REVIEW_COUNT = 20
    # SENTENCES_COUNT = 30

    parser = HtmlParser.from_string(xhtml, None, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = summary_algorithm(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    summaries = []
    for sentence in summarizer(parser.document, REVIEW_COUNT):
        sentence = str(sentence).strip()
        if sentence not in summaries and '?' not in sentence:
            summaries.append(sentence)

    return summaries
Example #12
    def _get_summary(self):
        if self.readable == '':
            return

        language = self.language.lower()
        if language == '':
            language = 'english'

        parser = HtmlParser.from_string(
            self.readable, self.url, Tokenizer(language))
        stemmer = Stemmer(language)
        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(language)
        summary = []
        for sentence in summarizer(parser.document, 10):
            if sentence.is_heading:
                summary.append('<h2>%s</h2>' % (unicode(sentence)))
            else:
                summary.append('<p>%s</p>' % (unicode(sentence)))

        self.summary = ''.join(summary)
Example #13
def summerize_text(text):
    text = text.replace("#", " ").replace("\n", " ")
    parser = HtmlParser.from_string(text, "https://www.topsocial.com",
                                    Tokenizer(LANGUAGE))
    # or for plain text files
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    sentences = summarizer(parser.document, SENTENCES_COUNT)
    return sentences[0] if len(sentences) > 0 else None
    # for sentence in summarizer(parser.document, SENTENCES_COUNT):
    #     print(sentence)


# summerize_text("""با این حال در روزهاى پایانى فصل و با انتشار دوباره شایعه مذاکره اش با پرسپولیس، منصوریان رویه تازه اى براى این هافبک دفاعى در نظر گرفت و در اولین گام نامش را از لیست جدال با الاهلى خط زد. مساله اى که باعث شد تا او به حالت قهر در دو تمرین بعد از این بازى غایب باشد. منصوریان در سفر به مشهد و براى دیدار با پدیده هم بار دیگر نام این بازیکن را از لیست تیمش خط زد تا باقرى و جدایى از پیراهن استقلال به اپیزود پایانى خود نزدیک شود.
#
# از آنجایی که برانکو هم در پایان فصل پیش و هم در نقل و انتقالات نیم فصل علاقه خود به جذب این بازیکن را نشان داده بود و در نهایت ناکام مانده بود، حالا مرد کروات امیدوار شده تا سومین تیرش در جذب باقری به هدف بخورد و پیراهن پرسپولیس را به هافبک استقلال هدیه بدهد. اتفاقی که به زودی رخ خواهد داد.
#  """)
Example #14
def auto_summarize_comment(request):
    
    from sumy.nlp.stemmers import Stemmer
    #from sumy.utils import get_stop_words
    from sumy.parsers.html import HtmlParser
    from sumy.nlp.tokenizers import Tokenizer
    #from sumy.summarizers.lsa import LsaSummarizer as Summarizer
    #from sumy.summarizers.text_rank import TextRankSummarizer as Summarizer
    from sumy.summarizers.lex_rank import LexRankSummarizer as Summarizer
         
    stemmer = Stemmer("english")
    summarizer = Summarizer(stemmer)
    
    comment_ids = request.POST.getlist('d_ids[]')
    
    sent_list = []
    
    for comment_id in comment_ids:
        comment = Comment.objects.get(id=comment_id)
        text = comment.text
        
        text = re.sub('<br>', ' ', text)
        text = re.sub('<BR>', ' ', text)
        
        parser = HtmlParser.from_string(text, '', Tokenizer("english"))
        
        num_sents = request.GET.get('num_sents', None)
        if not num_sents:
            all_sents = parser.tokenize_sentences(text)
            num_sents = floor(float(len(all_sents))/3.0)
        
        sents = summarizer(parser.document, num_sents)
         
        
        for sent in sents:
            sent_list.append(sent._text)
     
    return JsonResponse({"sents": sent_list})
Example #15
 def summarize(self, summarizer_type, max_sentences, document = ""):
     if self.document == "":
         target_document = document
     else:
         target_document = self.document
     # Spacing
     _target_document = ""
     sentence_list = self.pro.sentence_splitter(target_document)
     for sentence in sentence_list:
         _target_document += sentence + " "
     _target_document = _target_document.strip()
     # TextRank
     if summarizer_type == "textrank":
         self.result_list = summarize(_target_document, ratio=0.3, word_count=None, split=True)[:max_sentences]
     # LSA (sumy)
     elif summarizer_type == "lsa":
         parser = HtmlParser.from_string(_target_document, None, tokenizer=Tokenizer("english"))
         stemmer = Stemmer("english")
         summarizer = LsaSummarizer(stemmer)
         summarizer.stop_words = get_stop_words("english")
         summarized_sentence_list = summarizer(parser.document, max_sentences)
         self.result_list = [str(sentence) for sentence in summarized_sentence_list]
     return self.result_list
Example #16
def get_summary(article, url=False, num_sentence=NUM_SUMMARY_SENTENCE):
    """
    get the summary of one article
    :param num_sentence: number of sentence left for summary
    :param article: html string of the article or the url of the article
    :param url: True is article is an url
    :return: the summary of the article as string
    """
    if url:
        parser = HtmlParser.from_url(article, tokenizer=Tokenizer(LANGUAGE))
    else:
        parser = HtmlParser.from_string(article,
                                        tokenizer=Tokenizer(LANGUAGE),
                                        url=None)
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    summ_sents = summarizer(parser.document, num_sentence)
    summary = " ".join([str(s).strip() for s in summ_sents])

    return summary
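
A short usage sketch for both modes described in the docstring (placeholder URL and HTML; NUM_SUMMARY_SENTENCE, LANGUAGE and the sumy imports are assumed to be defined at module level as in the other examples):

article_url = "https://en.wikipedia.org/wiki/Automatic_summarization"  # placeholder
print(get_summary(article_url, url=True))        # summarize directly from the URL
html_string = "<html><body><p>Some already-downloaded article text.</p></body></html>"
print(get_summary(html_string, num_sentence=2))  # summarize an HTML string
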
Example #17
    def do_work(self, worker_id, work):
        """Greenlet to fetch and analyze URL content"""
        url = work
        print '[+] {0}: Starting crawl of {1}'.format(worker_id, url)

        """Using urllib2 via geventhttpclient. Selenium with 
        PhantomJS or a real browser would be probably better
        but slower and more expensive. Could have also used
        scrapy, but thats way to heavy for this use-case."""
        body = urlopen(url).read()

        """Using Sumy (built on nltk) for page summaries since
        it supports a number of ranking algorithms. It's not
        perfect though, it was written for czech and so its 
        missing some important English-specific things (e.g.
        bonus/significant words for Edmundson Summarizers)

        https://pypi.python.org/pypi/sumy

        TextBlob might be a better alternative, but it didn't
        seem to provide overall summary information. 

        https://textblob.readthedocs.org/en/latest/
        """
        parser = HtmlParser.from_string(body, None, Tokenizer(LANGUAGE))
        stemmer = Stemmer(LANGUAGE)

        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)

        words = []
        for sentence in summarizer(parser.document, 10):
            # accumulate words from every summary sentence (not just the last one)
            words.extend(str(sentence).split())

        # Send the results
        self.work_done(worker_id, words)
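
The comment above notes that sumy's Edmundson summarizer needs English bonus/significant words supplied by the caller. A minimal sketch of what that would look like, reusing the stemmer, parser and LANGUAGE names from the method above; the word lists are illustrative placeholders, not a vetted English resource:

from sumy.summarizers.edmundson import EdmundsonSummarizer

edmundson = EdmundsonSummarizer(stemmer)
edmundson.bonus_words = ("significant", "important", "key", "conclusion")  # placeholder list
edmundson.stigma_words = ("hardly", "barely", "perhaps")                   # placeholder list
edmundson.null_words = get_stop_words(LANGUAGE)
for sentence in edmundson(parser.document, 10):
    print(str(sentence))
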
Example #18
from sumy.parsers.html import HtmlParser
from sumy.nlp.tokenizers import Tokenizer
import pkuseg
import math
LANGUAGE = "chinese"
SENTENCES_COUNT = 3

if __name__ == "__main__":
    seg = pkuseg.pkuseg(postag=True)

    for i, line in enumerate(open("data/content_new_2.txt").readlines()):
        parser = HtmlParser.from_string(line,
                                        tokenizer=Tokenizer(LANGUAGE),
                                        url=None)
        sent_score = []
        for j, sent in enumerate(parser.document.sentences):
            text = sent._text
            segtext = seg.cut(text)
            nnum = len(list(filter(lambda x: x[1] == 'n', segtext)))
            rnum = len(list(filter(lambda x: x[0] == '本院', segtext)))
            r2num = len(list(filter(lambda x: x[0] == '认为', segtext)))
            lnum = math.log(len(segtext), 10)
            sent_score.append(
                (text,
                 (nnum + rnum * 7 + r2num * 5) * 1.0 / len(segtext) + lnum, j,
                 segtext))

        num_sentences = int(max(min(5,
                                    len(parser.document.sentences) / 15), 3))
        sent_score.sort(key=lambda x: x[1], reverse=True)
        sent_idx = [(text, idx) for text, radio, idx, segtext in sent_score
Example #19
CONFIG_FILE = environ["HOME"] + "/.cloudfeed"
CONFIG = {'last_pub': 0}
LANGUAGE = 'english'

if path.exists(CONFIG_FILE):
    with open(CONFIG_FILE, mode="r") as f:
        CONFIG.update(json.load(f))

feed = CloudFeed(db=CONFIG['database'])
mastodon = Mastodon(client_id=CONFIG['client_id'],
                    client_secret=CONFIG['client_secret'],
                    access_token=CONFIG['access_token'],
                    api_base_url=CONFIG['mastodon_url'])
summarizer = LexRankSummarizer(Stemmer(LANGUAGE))
summarizer.stop_words = get_stop_words(LANGUAGE)

new_posts = feed.get_posts(since=CONFIG['last_pub'])
if len(new_posts) > 0:
    CONFIG["last_pub"] = max([post["pub_date"] for post in new_posts])
    for post in new_posts:
        summary = summarizer(
            HtmlParser.from_string(post['body'], post['url'],
                                   Tokenizer("english")).document, 1)
        post["summary"] = summary[0]
        message = "{feed}: {title}\n{url}\n\n{summary}".format(**post)
        mastodon.toot(message)

with open(CONFIG_FILE, mode="w+") as f:
    json.dump(CONFIG, f)
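
The script expects ~/.cloudfeed to already contain the database path and Mastodon credentials it reads above. A minimal sketch (assumption) of bootstrapping that file with placeholder values:

import json
from os import environ

bootstrap = {
    "database": "cloudfeed.sqlite3",             # placeholder path
    "client_id": "<mastodon client id>",
    "client_secret": "<mastodon client secret>",
    "access_token": "<mastodon access token>",
    "mastodon_url": "https://mastodon.example",  # placeholder instance
    "last_pub": 0,
}
with open(environ["HOME"] + "/.cloudfeed", mode="w") as f:
    json.dump(bootstrap, f)
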
Example #20
def crawl():
    print(datetime.now())
    cursor = conn.cursor()
    cursor.execute("SET NAMES utf8mb4")
    cursor.execute('select id, name, feedUrl, lang, form from sources')
    sources = cursor.fetchall()
    start = time.clock()
    for source in sources:
        # if source['id']%30 == datetime.now().minute%30:
        print(source[0])
        source = {
            'id': source[0],
            'name': source[1],
            'feedUrl': source[2].replace("39.105.127.55", "127.0.0.1"),
            'lang': source[3],
            'form': source[4]
        }
        print(source['name'])
        LANGUAGE = 'chinese'
        if source['lang'] == 2:
            LANGUAGE = 'english'
        items = feedparser.parse(source['feedUrl'])['items']
        for item in items:
            try:
                cursor.execute('select 1 from entries where link = %s limit 1',
                               (item['link'], ))
                results = cursor.fetchall()
                if (not results) or (len(results) == 0):
                    try:
                        entry = {
                            'title':
                            item['title'],
                            'link':
                            item['link'],
                            'source_id':
                            source['id'],
                            'source_name':
                            source['name'],
                            'time':
                            '',
                            'crawl_time':
                            datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                            'photo':
                            '',
                            'lang':
                            1,
                            'author':
                            '',
                            'description':
                            '',
                            'digest':
                            '',
                            'content':
                            '',
                            'cluster':
                            0,
                            'sim_count':
                            0,
                            'simhash':
                            '0',
                            'cate11':
                            '',
                            'cate12':
                            '',
                            'cate13':
                            '',
                            'cate21':
                            '',
                            'cate22':
                            '',
                            'cate23':
                            '',
                            'tag1':
                            '',
                            'tag2':
                            '',
                            'tag3':
                            '',
                            'tag4':
                            '',
                            'tag5':
                            '',
                            'video':
                            '',
                            'video_frame':
                            '',
                            'audio':
                            '',
                            'audio_frame':
                            ''
                        }
                        cate1 = ['', '', '']
                        cate2 = ['', '', '']
                        tag = ['', '', '', '', '']
                        ############ Additional Settings for special sources ##############
                        if entry['source_name'] == 'Hacker News':
                            entry['link'] = item['comments']
                        ###########################

                        if is_en(entry['title']):
                            entry['lang'] = 2
                        if 'published_parsed' in item:
                            try:
                                entry['time'] = datetime.fromtimestamp(
                                    mktime(
                                        item['published_parsed'])) + timedelta(
                                            hours=TZ_DELTA)
                            except Exception as e:
                                entry['time'] = entry['crawl_time']
                                print('Exception when published_parsed: {}'.
                                      format(e))
                        else:
                            entry['time'] = entry['crawl_time']

                        if 'author' in item:
                            entry['author'] = item['author'][0:20]

                        if 'summary' in item:
                            entry['description'] = item['summary'][0:500]

                        if 'content' in item:
                            entry['content'] = item['content'][0]['value'][
                                0:15000]
                        if entry['content'] == '' and 'summary' in item and len(
                                item['summary']) > 0:
                            entry['content'] = item['summary'][0:15000]
                        for field in item['links']:
                            if field['type'] == 'audio/mpeg':
                                if field['href'].endswith('.mp3'):
                                    entry['audio'] = field['href']
                                if field['href'].endswith('.mp4'):
                                    entry['video'] = field['href']

                        # Only article-type entries get a digest, clustering, categorization and tags
                        if source['form'] == 1:
                            try:
                                if entry['content'] != '':
                                    entry['photo'] = getImg(entry['content'])
                                    if len(entry['photo']) > 255:
                                        entry['photo'] = ''

                                    parser = HtmlParser.from_string(
                                        entry['content'], "",
                                        Tokenizer(LANGUAGE))
                                    stemmer = Stemmer(LANGUAGE)
                                    summarizer = Summarizer(stemmer)
                                    summarizer.stop_words = get_stop_words(
                                        LANGUAGE)
                                    for sentence in summarizer(
                                            parser.document, SENTENCES_COUNT):
                                        entry['digest'] += str(sentence)
                                        if len(entry['digest']) >= 500:
                                            break
                                else:
                                    parser = HtmlParser.from_url(
                                        entry['link'], Tokenizer(LANGUAGE))
                                    stemmer = Stemmer(LANGUAGE)
                                    summarizer = Summarizer(stemmer)
                                    summarizer.stop_words = get_stop_words(
                                        LANGUAGE)
                                    for sentence in summarizer(
                                            parser.document, SENTENCES_COUNT):
                                        entry['digest'] += str(sentence)
                                        if len(entry['digest']) >= 500:
                                            break
                                entry['digest'] = entry['digest'][0:500]
                            except Exception as e:
                                print(
                                    'Exception when getting digest: {}'.format(
                                        e))

                            features = get_features(entry['title'],
                                                    entry['content'])
                            try:
                                entry['simhash'] = str(Simhash(features).value)
                                nears = index.get_near_dups(Simhash(features))
                                if len(nears) > 0:
                                    entry['sim_count'] = len(nears)
                                    cursor.execute(
                                        'select cluster from entries where id = %s',
                                        (int(nears[0]), ))
                                    near_cluster = cursor.fetchone()[0]
                                    entry['cluster'] = near_cluster
                                else:
                                    global last_cluster_num
                                    entry['cluster'] = last_cluster_num
                                    last_cluster_num += 1
                            except Exception as e:
                                print(
                                    'Exception when clustering: {}'.format(e))

                            try:
                                content2 = BeautifulSoup(
                                    entry['content'], "lxml").text.encode(
                                        'gbk', 'ignore').decode(
                                            'gbk')[0:AIP_MAX_LEN_CONTENT]
                                if len(content2) == 0:
                                    if len(entry['digest']) > 0:
                                        content2 = entry['digest']
                                title2 = entry['title'][0:AIP_MAX_LEN_TITLE]
                                keywords = client.keyword(title2, content2)
                                topics = client.topic(title2, content2)
                                i = 0
                                for item in topics['item']['lv1_tag_list']:
                                    cate1[i] = item['tag']
                                    i += 1
                                    if i > 2:
                                        break
                                i = 0
                                for item in topics['item']['lv2_tag_list']:
                                    cate2[i] = item['tag']
                                    i += 1
                                    if i > 2:
                                        break
                                i = 0
                                for item in keywords['items']:
                                    tag[i] = item['tag']
                                    i += 1
                                    if i > 4:
                                        break
                                entry['cate11'] = cate1[0]
                                entry['cate12'] = cate1[1]
                                entry['cate13'] = cate1[2]
                                entry['cate21'] = cate2[0]
                                entry['cate22'] = cate2[1]
                                entry['cate23'] = cate2[2]
                                entry['tag1'] = tag[0]
                                entry['tag2'] = tag[1]
                                entry['tag3'] = tag[2]
                                entry['tag4'] = tag[3]
                                entry['tag5'] = tag[4]
                            except Exception as e:
                                print(
                                    'Exception when categorizing and tagging: {}'
                                    .format(e))

                        elif source['form'] == 2:
                            entry['photo'] = getWeiboImg(entry['content'])
                            entry['digest'] = filterWeiboTags(entry['content'])
                            if len(entry['digest']) > 500:
                                entry['digest'] = entry['digest'][0:500]

                        elif source['form'] == 4:
                            if entry['link'].startswith(
                                    'https://www.bilibili.com/video'):
                                entry['video_frame'] = 'http://player.bilibili.com/player.html?aid=' + \
                                    entry['link'][33:]

                        try:
                            cursor.execute(add_entry, entry)
                            conn.commit()
                            index.add(str(cursor.lastrowid), Simhash(features))
                        except Exception as e:
                            print('Exception when add entry: {}'.format(e))
                    except Exception as e:
                        print("Unexpected Error: {}".format(e))
            except Exception as e:
                print("Unexpected Error: {}".format(e))
        # print(d['feed']['title'])
    elapsed = time.clock() - start
    print('time used: ' + str(elapsed))

    # Close the cursor and connection:
    cursor.close()
Example #21
def download_sources(summarize=True, sources=currentFeeds):
    raw_documents = []
    complete_urls = []

    # Download News Stories
    converter = html2text.HTML2Text()
    converter.ignore_links = True
    converter.ignore_images = True
    converter.bypass_tables = True

    count_error = 0
    document_count = 0

    feed_count = -1

    for url in currentFeeds:
        feed_count += 1
        current_feed_document = 0

        currentStories = []
        feed = feedparser.parse(url[1])
        for story in feed.entries:
            current_feed_document += 1

            if story.title.startswith(u'VIDEO:') or story.title.startswith(
                    u'AUDIO'):
                continue
            if story.link in complete_urls:
                continue

            try:
                res = requests.get(story.link)

                html = res.text
                title = story.title.encode('utf-8')

                completion = (
                    (feed_count +
                     (current_feed_document / float(len(feed.entries)))) /
                    (float(len(currentFeeds)))) * 100

                print "[" + ("%.2f" %
                             completion) + "%] \t " + feed.feed.title.encode(
                                 'utf-8') + " - " + title

                raw_text = converter.handle(html)
                if summarize:
                    parser = HtmlParser.from_string(html, None,
                                                    Tokenizer("english"))

                    summarizer = LsaSummarizer(stem_word)
                    summarizer.stop_words = get_stop_words("english")

                    sum_text = [
                        sentence
                        for sentence in summarizer(parser.document, 20)
                    ]
                    raw_text = (" ".join([
                        str(sentence) for sentence in sum_text
                    ])).decode('utf-8')
                    # print raw_text

                stats = TextBlob(raw_text)
                currentStories.append(
                    (title, raw_text, story.link, stats.sentiment,
                     story.published_parsed))
                complete_urls.append(story.link)

                document_count += 1

            except KeyboardInterrupt:
                print "Quitting from Keyboard Interrupt."
                sys.exit(0)
            except:
                count_error += 1
                print "\t Error occurred while processing that story:", sys.exc_info(
                )[0]
                traceback.print_exc()

        raw_documents.append((url[0], currentStories))

    print "Received", document_count, "documents with", count_error, "errors"
    return raw_documents
Example #22
def summarize(entry, count):
    clean = lambda sentence: re.sub(r' (?:[;,:.!?])', '', unicode(sentence))
    parser = HtmlParser.from_string(entry.content, entry.url, tokenizer)
    sentences = map(clean, summarizer(parser.document, count))
    return '<ul>{}</ul>'.format(''.join(
        '<li>{}</li>'.format(sentence) for sentence in sentences))
Example #23
def crawl():
    print(datetime.now())
    cursor = conn.cursor()
    cursor.execute("SET NAMES utf8mb4")
    cursor.execute(
        'select id, name, feedUrl, lang, form, content_rss from sources where mod(id, 30)=mod(%s, 30)',
        (datetime.now().minute, ))
    sources = cursor.fetchall()
    start = time.clock()
    for source in sources:
        # if source['id']%30 == datetime.now().minute%30:
        print(source[0])
        source = {
            'id': source[0],
            'name': source[1],
            'feedUrl': source[2].replace("188.131.178.76", "127.0.0.1"),
            'lang': source[3],
            'form': source[4],
            'content_rss': source[5]
        }
        print(source['name'])
        LANGUAGE = 'chinese'
        if source['lang'] == 2:
            LANGUAGE = 'english'
        items = feedparser.parse(source['feedUrl'])['items']
        for item in items:
            cursor.execute('select 1 from entries where link = %s limit 1',
                           (item['link'], ))
            results = cursor.fetchall()
            if (not results) or (len(results) == 0):
                entry = {
                    'title':
                    item['title'],
                    'link':
                    item['link'],
                    'source_id':
                    source['id'],
                    'source_name':
                    source['name'],
                    'time':
                    datetime.fromtimestamp(mktime(item['published_parsed'])) +
                    timedelta(hours=TZ_DELTA),
                    'crawl_time':
                    datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                    'photo':
                    '',
                    'lang':
                    source['lang'],
                    'author':
                    item['author'],
                    'description':
                    '',
                    'digest':
                    '',
                    'content':
                    ''
                }
                if 'content' in item:
                    entry['content'] = item['content'][0]['value']
                if entry['content'] == '':
                    entry['content'] = item['summary']
                if entry['content'] != '':
                    entry['photo'] = getImg(entry['content'])
                if source['form'] == 1:
                    if source['content_rss'] == 1 and entry['content'] != '':
                        parser = HtmlParser.from_string(
                            entry['content'], "", Tokenizer(LANGUAGE))
                        stemmer = Stemmer(LANGUAGE)
                        summarizer = Summarizer(stemmer)
                        summarizer.stop_words = get_stop_words(LANGUAGE)
                        for sentence in summarizer(parser.document,
                                                   SENTENCES_COUNT):
                            entry['digest'] += str(sentence)
                            if len(entry['digest']) >= 500:
                                break

                    else:
                        parser = HtmlParser.from_url(entry['link'],
                                                     Tokenizer(LANGUAGE))
                        stemmer = Stemmer(LANGUAGE)
                        summarizer = Summarizer(stemmer)
                        summarizer.stop_words = get_stop_words(LANGUAGE)
                        for sentence in summarizer(parser.document,
                                                   SENTENCES_COUNT):
                            entry['digest'] += str(sentence)
                            if len(entry['digest']) >= 500:
                                break
                    entry['digest'] = entry['digest'][0:500]
                cursor.execute(add_entry, entry)
                conn.commit()
        # print(d['feed']['title'])
    elapsed = time.clock() - start
    print('time used: ' + str(elapsed))

    # Close the cursor and connection:
    cursor.close()
Example #24
 def summarize_html(self, content):
     parser = HtmlParser.from_string(content, "", Tokenizer(self.lang))
     return self.__summarize(content, parser)