Example #1
    def insertSumma(self):

        lexrank = LexRank()
        curs = self.conn.cursor()

        summa_total = ""
        # only articles with more than 5 sentences are worth summarizing
        sql = "SELECT id,raw,sentence_cnt FROM " + self.table + " WHERE sentence_cnt > 5 ORDER BY id ASC;"
        curs.execute(sql)
        rows = curs.fetchall()

        for row in rows:
            raw = row[1]
            sentence_cnt = row[2]
            text = "".join(raw)

            if lexrank.summarize(text) == 0:  # nothing to summarize
                continue

            # 5-sentence summary for short articles, 10 for long ones
            if sentence_cnt <= 20:
                summaries = lexrank.probe(5)
            else:
                summaries = lexrank.probe(10)

            for summa in summaries:
                summa_total += "".join(summa)
                summa_total += ". "

            curs.execute("UPDATE " + self.table + " SET summa=%s WHERE id=%s;",
                         (summa_total, row[0]))
            self.conn.commit()
            summa_total = ""
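All of these examples share the same three-step lexrankr flow: construct a LexRank, feed it text with summarize(), then pull the top sentences out with probe(). A minimal, self-contained sketch of that flow (the sample text and constructor defaults are assumptions; summarize() and probe() are used exactly as in the examples on this page):

from lexrankr import LexRank

lexrank = LexRank()  # defaults; later examples pass tuning kwargs
lexrank.summarize("문장 하나. 문장 둘. 문장 셋.")  # build the sentence graph
summaries = lexrank.probe(2)  # list of the top-2 sentence strings
print(". ".join(summaries) + ".")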
Example #2
    def post(self):
        import jpype
        if jpype.isJVMStarted():
            jpype.attachThreadToJVM()
        result = []
        corpus = self.params['corpus']

        if corpus:
            result_count = self.params['result_count']
            data_save = self.params['data_save']

            if data_save.upper() == 'Y':
                self.getDataApi().WordPool.add_word_pool(corpus)

            config = {
                # 'useful_tags': ['Noun', 'Verb', 'Adjective', 'Determiner', 'Adverb', 'Conjunction', 'Josa', 'PreEomi',
                #                 'Eomi', 'Suffix', 'Alpha', 'Number'],
                'useful_tags': ['Noun', 'ProperNoun'],
                'min_token_length': 5
            }

            lexRank = LexRank(**config)
            lexRank.summarize(corpus)

            result_count = min(result_count, lexRank.num_sentences - 1)
            if result_count == 0:
                result_count = 1

            result = lexRank.probe(result_count)

        return result
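The two-step clamp above (cap result_count at num_sentences - 1, then bump 0 back up to 1) collapses into a single expression; a sketch, assuming result_count is non-negative and at least one sentence was parsed:

result_count = max(1, min(result_count, lexRank.num_sentences - 1))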
Example #3
def lexrank(rst):
    print("=" * 20)
    print(len(rst))
    print(rst[0])
    print("=" * 20)
    lexInputText = ""
    hangul = re.compile('[^ ㄱ-ㅣ가-힣]+')
    for news in rst:
        lexInputText += str(news['id'])
        parse = hangul.sub('', news['title'])
        parse = re.sub(' +', ' ', parse).strip()
        parse = re.sub(r'\.', '', parse)
        lexInputText += parse
        parse = hangul.sub('', news['content'])
        parse = re.sub(' +', ' ', parse).strip()
        parse = re.sub(r'\.', '', parse)
        lexInputText += parse
        lexInputText += ". "

    print("=" * 10 + "LEXRANK" + "=" * 10)
    lexrank = LexRank()
    lexrank.summarize(lexInputText)
    summaries = lexrank.probe(10)
    searchId = re.compile('[0-9]{5,6}')
    idList = []
    for summary in summaries:
        idList.append(searchId.search(summary).group())
    return idList
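Example #3 relies on a small trick: each article's numeric id is prepended to its cleaned text, so after summarization the 5-to-6-digit id can be regex-extracted from a chosen sentence and traced back to the source article. A standalone sketch of that round trip (the sample sentence is hypothetical; note the added guard, since search() returns None when no id survives into the summary):

import re

searchId = re.compile('[0-9]{5,6}')
summary = "12345경제 성장률 전망 상향"  # id glued onto the sentence, as in lexrank() above
match = searchId.search(summary)
if match:
    print(match.group())  # -> '12345'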
Example #4
    def insertSumma(self):

        lexrank = LexRank()
        curs = self.conn.cursor()

        summa_total = ""
        sql = "SELECT id,raw,sentence_cnt FROM " + self.table + " ORDER BY id ASC limit 3;"
        curs.execute(sql)
        rows = curs.fetchall()
        for row in rows:
            print("insert summa to id=" + str(row[0]))
            text = "".join(row[1])
            if lexrank.summarize(text) == 0:  # nothing to summarize
                continue

            # 5-sentence summary for short articles, 10 for long ones
            if row[2] <= 20:
                summaries = lexrank.probe(5)
            else:
                summaries = lexrank.probe(10)

            for summa in summaries:
                summa_total += "".join(summa)
                summa_total += ". "

            print(" summa = " + summa_total + "\n")

            curs.execute("UPDATE " + self.table + " SET summa=%s WHERE id=%s;",
                         (summa_total, row[0]))
            self.conn.commit()
            summa_total = ""
Example #5
def summarize_text_with_lexrank(text):
    lexrank = LexRank(n_clusters=1)
    lexrank.summarize(text)
    summaries = lexrank.probe(3)

    if len(summaries) == 0:
        return ""

    return ". ".join(summaries)
Example #6
	def summarize(self, n_summary):
		print("summarize in %d sentences" %n_summary)
		lexrank = LexRank()
		lexrank.summarize(" ".join(self.preprocessed))
		summaries = lexrank.probe(n_summary)  
		# `num_summaries` can be `None` (using auto-detected topics)

		for summary in summaries:
			print(summary)
		self.summaries = summaries
		return summaries
Example #7
File: test.py  Project: DSLDataStorage/KETI
class TestLexRank(unittest.TestCase):

    def setUp(self):
        self.text = "사과 배 감 귤. 배 감 귤 수박. 감 귤 수박 딸기. 오이 참외 오징어. 참외 오징어 달팽이."
        self.lexrank = LexRank(min_keyword_length=0, no_below_word_count=0, min_cluster_size=1)

    def test_summarized(self):
        self.lexrank.summarize(self.text)
        summaries = self.lexrank.probe()
        self.assertEqual(len(summaries), 2)
        self.assertEqual(summaries[0], "배 감 귤 수박")
Example #8
    def sum_talker(self, cluster):

        per_talker_dict = {}
        lexrank = LexRank(clustering=None)
        for sen in cluster.sentences:
            if sen.talker not in per_talker_dict:
                per_talker_dict[sen.talker] = ""
            per_talker_dict[sen.talker] += sen.text + "\n"

        for k, v in per_talker_dict.items():
            lexrank.summarize(v)
            self.sum_per_talker[k].append(lexrank.probe(1))
Example #9
def wordRank(text):
    text = text.replace('\\n', '.')
    new = re.sub(r'[^가-힣0-9a-zA-Z\s\.]', '', text)  # 힣 (not 힝) covers the full Hangul syllable range

    lexrank = LexRank()
    lexrank.summarize(new)
    summaries = lexrank.probe(3)
    word = Twitter()
    out = []
    for summary in summaries:
        out += word.nouns(summary)

    out = list(set(out))
    print(out)
Example #10
def imageSumalyze(pk):
    post = get_object_or_404(ImagePost, pk=pk)
    from lexrankr import LexRank
    lexrank = LexRank()
    path = os.path.dirname(os.path.dirname(
        os.path.abspath(__file__))) + '\\media\\' + str(post.pdf)
    text = img2Text(path)
    os.remove(path)
    chunk = []
    if len(text) < 650:
        chunk.append(text)
    else:
        # fixed 650-char chunks; zip() drops any trailing partial block (see the note after this example)
        chunk = list(map(''.join, zip(*[iter(text)] * 650)))
    chunk2 = []
    idxToDB = ''
    # apply summarization to each chunk
    idx = 0
    while idx != len(chunk):
        try:
            lexrank.summarize(chunk[idx])
            summaries = lexrank.probe(3)
        except Exception:  # chunks too short to summarize are skipped
            idx += 1
            continue
        summaries = [s + '. ' for s in summaries]
        indexStr = ''.join(summaries)
        chunk[idx] = indexStr
        idxToDB += ibmIndex(indexStr, summaries)
        idxToDB += '#'
        chunk2.append(chunk[idx])
        idx += 1
    post.index = idxToDB

    chunk = []
    chunkToDB = ''
    for c in chunk2:
        chunkToDB += c + '\n'

    post.content = chunkToDB
    post.pdf = None
    post.index = idxToDB
    # run IBM Natural Language Understanding on the original text, not the summary
    post.keyword, post.relevance, post.category_ibm = ibmContent(text)
    post.save()
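One caveat on the chunking idiom list(map(''.join, zip(*[iter(text)] * 650))) used above and again in Example #17: zip() stops at the last complete 650-character block, so a trailing remainder shorter than 650 characters is silently dropped. A remainder-preserving alternative, if that tail matters:

chunk = [text[i:i + 650] for i in range(0, len(text), 650)]  # keeps the final partial chunk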
Example #11
def lexlank_function(result):
    # Reference: https://wikidocs.net/72820
    # LexRank : https://github.com/theeluwin/lexrankr
    try:
        lexrank = LexRank()
        lexrank.summarize(result)

        summarize_data = []
        print("요약 진행중!")  # "summary in progress!"
        summaries = lexrank.probe(10)
        for summary in summaries:
            summarize_data.append(summary)

        return summarize_data
    except Exception:
        print("요약 내용이 부족합니다.")  # "not enough content to summarize"
        return []
Example #12
File: tasks.py  Project: Seoyoung2/Sumalyze
def videoSumalyze(pk):
    post = get_object_or_404(VideoPost, pk=pk)
    lang = 'en-US'
    if post.lang == '한국어':
        lang = 'ko-KR'
    from lexrankr import LexRank
    lexrank = LexRank()
    chunk = []
    chunk2 = []
    url = post.url
    path = youtubeToAudio(url)
    #post.title = re.sub('[^가-힣\\s]', '', str(path))
    chunk = splitandSTT(path, lang)
    text = " ".join(chunk)
    os.remove(path + '.mp3')
    os.remove(path)
    idxToDB = ''
    # apply summarization to each chunk
    idx = 0
    while idx != len(chunk):
        try:
            lexrank.summarize(chunk[idx])
            summaries = lexrank.probe(3)
        except Exception:  # chunks too short to summarize are skipped
            idx += 1
            continue
        summaries = [s + '. ' for s in summaries]
        indexStr = ''.join(summaries)
        chunk[idx] = indexStr
        idxToDB += ibmIndex(indexStr, summaries)
        idxToDB += '#'
        chunk2.append(chunk[idx])
        idx += 1
    post.index = idxToDB

    chunk = []
    chunkToDB = ''
    for c in chunk2:
        chunkToDB += c + '\n'

    post.content = chunkToDB
    # run IBM Natural Language Understanding on the original text, not the summary
    post.keyword, post.relevance, post.category_ibm = ibmContent(text)
    post.save()
Example #13
    def summarize(self):
        lex = LexRank()
        lex.summarize(self.paragraph)

        if len(self.paragraph) < 100:
            self.probe_num = 1
        elif len(self.paragraph) < 200:
            self.probe_num = 2
        elif len(self.paragraph) < 300:
            self.probe_num = 3
        elif len(self.paragraph) < 400:
            self.probe_num = 4
        else:
            self.probe_num = 5

        summaries = lex.probe(self.probe_num)
        return summaries
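The if/elif ladder above is equivalent to a single clamped expression (one summary sentence per ~100 characters, capped at 5), which may read more clearly:

self.probe_num = min(len(self.paragraph) // 100 + 1, 5)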
Example #14
class LexRankforSummarization:
    def __init__(self):

        self.lexrank = LexRank(min_keyword_length=0,
                               no_below_word_count=0,
                               min_cluster_size=1)

    def test_summarized(self, text):
        self.lexrank.summarize(text)
        try:
            summaries = self.lexrank.probe(3)
        except Exception:  # fall back when three sentences are not available
            summaries = self.lexrank.probe(2)
        result = []
        for summary in summaries:
            result.append(summary)
        # print("result",result)
        return result
Example #15
def smry(q):
    while True:
        try:
            print("smry start")
            global count
            data = q.get()
            count += 1
            #print("get{}!".format(count))
            lexrank = LexRank()
            lexrank.summarize(data[4])  # summarize the article body in data[4]
            summaries = lexrank.probe(3)  # 3-sentence summary; probe() returns a list
            data[4] = '. '.join(summaries) + '.'  # re-join the summary sentences and store back
            print(data)  # final record to be stored in the DB
            db_store(data)
            # for summary in summaries:
            #     print(summary)
        except (IndexError,ValueError,AttributeError):
            pass
Example #16
File: tasks.py  Project: Seoyoung2/Sumalyze
def audioSumalyze(pk):
    post = get_object_or_404(AudioPost, pk=pk)
    lang = 'en-US'
    if post.lang == '한국어':
        lang = 'ko-KR'
    from lexrankr import LexRank
    lexrank = LexRank()
    chunk2 = []
    chunk = []

    #speechtotext(str(post.pdf), lang, chunk)
    chunk = splitandSTT(path + str(post.pdf), lang)  # `path` is assumed to be defined elsewhere in the module
    text = " ".join(chunk)
    idxToDB = ''
    # apply summarization to each chunk
    idx = 0
    while idx != len(chunk):
        try:
            lexrank.summarize(chunk[idx])
            summaries = lexrank.probe(3)
        except Exception:  # chunks too short to summarize are skipped
            idx += 1
            continue
        summaries = [s + '. ' for s in summaries]
        indexStr = ''.join(summaries)
        chunk[idx] = indexStr
        idxToDB += ibmIndex(indexStr, summaries)
        idxToDB += '#'
        chunk2.append(chunk[idx])
        idx += 1
    post.index = idxToDB

    chunk = []
    chunkToDB = ''
    for c in chunk2:
        chunkToDB += c + '\n'

    post.content = chunkToDB
    # run IBM Natural Language Understanding on the original text, not the summary
    post.keyword, post.relevance, post.category_ibm = ibmContent(text)
    post.save()
Example #17
def textSumalyze(pk):
    post = get_object_or_404(TextPost, pk=pk)
    from lexrankr import LexRank
    lexrank = LexRank()
    text = post.text
    chunk = []
    if len(text) < 650:
        chunk.append(text)
    else:
        # fixed 650-char chunks; zip() drops any trailing partial block (see the note after Example #10)
        chunk = list(map(''.join, zip(*[iter(text)] * 650)))
    chunk2 = []
    idxToDB = ''
    # apply summarization to each chunk
    idx = 0
    while idx != len(chunk):
        try:
            lexrank.summarize(chunk[idx])
            summaries = lexrank.probe(3)
        except Exception:  # chunks too short to summarize are skipped
            idx += 1
            continue
        summaries = [s + '. ' for s in summaries]
        indexStr = ''.join(summaries)
        chunk[idx] = indexStr
        idxToDB += ibmIndex(indexStr, summaries)
        idxToDB += '#'
        chunk2.append(chunk[idx])
        idx += 1
    post.index = idxToDB

    chunk = []
    chunkToDB = ''
    for c in chunk2:
        chunkToDB += c + '\n'

    post.content = chunkToDB
    post.text = 'clear'
    # run IBM Natural Language Understanding on the original text, not the summary
    post.keyword, post.relevance, post.category_ibm = ibmContent(text)
    post.save()
Example #18
def lexranker(text, code, date):

    text = text.replace('\\n', '.')
    text2 = re.sub(r'[^가-힣0-9a-zA-Z\s\.]', '', text)  # 힣 (not 힝) covers the full Hangul syllable range

    lexrank = LexRank()
    #print(text2)
    lexrank.summarize(text2)
    summaries = lexrank.probe(3)
    word = Twitter()
    out = []
    print(summaries)
    for summary in summaries:
        out += word.nouns(summary)

    words = list(set(out))  # deduplicated nouns
    share = Share(code)
    startprice = share.get_open()
    endprice = share.get_price()
    for part in words:
        save_record(part, code, startprice, endprice, date)
Example #19
def post():
    try:
        start = time()
        parser = reqparse.RequestParser()
        parser.add_argument('contents', type=str)
        args = parser.parse_args()

        contents = args['contents']
        print("==== contents ==== : ", contents)

        print(contents)

        # lexrank.summarize(contents)
        # originalText = "이 업체 물건 절대 사지마세요. 형편없습니다. 직원들이 고객은 안중에도 없습니다. 열 받게 만드는 데 선수입니다. 박지원 직원인지 대표인지 몰라도 열받게 하는 제주가 보통이 넘습니다. 다시 한 번 강조하지만 절대 네버네버 스위트피 사지 마세요. 절대 비추입니다."
        lex = LexRank()
        lex.summarize(contents)

        top2 = lex.probe(2)  # avoid shadowing the built-in sum()

        print(top2)

        strd = top2[1] + " " + top2[0]

        resp = {"sum": strd}

        return jsonify(resp)
    except Exception as e:
        return {'error': str(e)}
Example #20
	def highlight(self, threshold=0.5):

		print("highlight : return list of chats and scores ")

		lexrank = LexRank()
		
		lexrank.summarize(" ".join(self.preprocessed))
		lexrank_sentences = [x.text for x in lexrank.sentences]

		scores = lexrank.sentence_score_pair()  
		
		preprocessed = self.preprocessed[:]
		preprocessed = [x.strip().strip(".").strip() for x in preprocessed]
		lex_idx = 0 
		skip_amount = 0
		jump = 0 
		for ts_sentence in self.ts.chat_to_sentence_mapping:
			ts_idx, chat_idxs, sentence = ts_sentence

			if lex_idx >= len(scores):
				break

			if len(sentence.strip()) == 0:
				jump += 1
			else:
				if preprocessed[lex_idx + skip_amount] != scores[lex_idx][1]:
					skip_amount += 1
				else:
					scores[lex_idx] = list(scores[lex_idx])
					scores[lex_idx][0] = lex_idx + jump + skip_amount
					scores[lex_idx].append(chat_idxs)
					lex_idx += 1
		self.highlight_lexrank = scores[:]
		print("highlight result")
		return_list = self._map_to_chat(self.highlight_lexrank)
		for chat in return_list:
			if chat[0] == 1:
				print(chat)
		return return_list
Example #21
    def insertSentenceCount(self):

        lexrank = LexRank()
        curs = self.conn.cursor()

        sql = "SELECT id,raw FROM " + self.table + " ORDER BY id ASC;"
        curs.execute(sql)
        rows = curs.fetchall()

        for row in rows:
            text = "".join(row[1])
            sc = lexrank.factory.text2sentences(text)  # split raw text into sentences
            print("insert sentence_cnt to id=" + str(row[0]))
            print("sc=" + str(len(sc)) + "\n")
            curs.execute("UPDATE " + self.table + " SET sentence_cnt=%s WHERE id=%s;",
                         (str(len(sc)), str(row[0])))
            self.conn.commit()
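This sentence count is what Example #1 reads back from the sentence_cnt column to pick its probe size. The two pieces fit together roughly like this (a sketch; factory.text2sentences is used exactly as above):

lexrank = LexRank()
lexrank.summarize(text)
sentences = lexrank.factory.text2sentences(text)  # sentence split, as in insertSentenceCount
summaries = lexrank.probe(5 if len(sentences) <= 20 else 10)  # the threshold Example #1 applies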
Example #22
    def sum_cluster(self):
        self.clusters = self.clustering()
        lexrank = LexRank(clustering=None)
        self.summaries = []
        for c in self.clusters:
            lexrank.summarize(c.sen2txt())
            for i, s in enumerate(c.sentences):
                print(i, s.text)
            if len(c.sentences) < 20:
                self.summaries.append(lexrank.probe(0.3))
            else:
                self.summaries.append(lexrank.probe(0.2))
            self.sum_talker(c)  # use current cluster to summarize per talker
Example #23
def getsentence(table):

    conn = pymysql.connect(host='localhost',
                           user='******',
                           password='******',
                           db='KETI',
                           use_unicode=True,
                           charset='utf8')
    lexrank = LexRank()
    curs = conn.cursor()

    #sql = "SELECT id,raw,sentence_cnt FROM "+ table +" where id>179460;"
    sql = "select count(*)*0.1 from " + table + ";"
    curs.execute(sql)
    rows = curs.fetchone()
    percentile = int(rows[0])

    sql = "SELECT id,raw,sentence_cnt FROM " + table + " order by sentence_cnt desc;"
    curs.execute(sql)
    rows = curs.fetchall()

    i = 0

    # rows are sorted by sentence count, descending: find the count at the top-10% cutoff
    for row in rows:
        i = i + 1
        if i == percentile:
            percentile_cnt = row[2]
            break

    sql = "SELECT avg(sentence_cnt) FROM " + table + " where sentence_cnt <=" + str(percentile_cnt) + ";"
    curs.execute(sql)
    rows = curs.fetchone()
    percentile_avg = int(rows[0])

    print(i, percentile_cnt, percentile_avg)
Example #24
    def makeEvents(self, summaries, rows, x_event):

        lexrank = LexRank()
        events = []
        events_size = 0
        for summa in summaries:

            find = False
            text = {}
            summa = str(summa)
            db_date = ""
            if (summa.find("기자]") != -1):
                summa = self.deleteNoise(summa)

            for row in rows:

                # When Summa value is none
                if (row[2] is None):
                    temp = "".join(row[1])
                    if (temp.find("".join(summa)) != -1):
                        find = True

                        db_raw = str(row[1])
                        db_summa = str(row[2])
                        db_title = str(row[3])
                        db_date = str(row[4])
                        db_sentenceCnt = row[6]  # keep numeric for the comparison with summa_count below
                        db_link = str(row[7])

                        if (db_raw.find("기자]") != -1):
                            db_raw = self.deleteNoise(db_raw)
                        if (db_summa.find("기자]") != -1):
                            db_summa = self.deleteNoise(db_summa)
                        if (db_title.find("기자]") != -1):
                            db_title = self.deleteNoise(db_title)

                        text["headline"] = db_title
                        text["text"] = db_raw
                        text["text2"] = db_raw
                        if (db_sentenceCnt > self.summa_count):
                            raw = "".join(row[1])
                            lexrank.summarize(raw)
                            raw_summaries = lexrank.probe(self.summa_count)
                            text3_summa = ""

                            for raw_summa in raw_summaries:
                                text3_summa += str(raw_summa) + ". "

                            if (text3_summa.find("기자]") != -1):
                                text3_summa = self.deleteNoise(text3_summa)
                            text["text3"] = text3_summa
                        else:
                            text["text3"] = ""
                        break

                # When Summa value isn't none
                else:
                    temp = "".join(row[2])
                    if (temp.find("".join(summa)) != -1):
                        find = True

                        db_raw = str(row[1])
                        db_summa = str(row[2])
                        db_title = str(row[3])
                        db_date = str(row[4])
                        db_sentenceCnt = row[6]  # keep numeric for the comparison with summa_count below
                        db_link = str(row[7])

                        if (db_raw.find("기자]") != -1):
                            db_raw = self.deleteNoise(db_raw)
                        if (db_summa.find("기자]") != -1):
                            db_summa = self.deleteNoise(db_summa)
                        if (db_title.find("기자]") != -1):
                            db_title = self.deleteNoise(db_title)

                        text["headline"] = db_title
                        text["text"] = db_raw
                        text["text2"] = db_summa

                        if (db_sentenceCnt > self.summa_count):
                            raw = "".join(row[1])
                            lexrank.summarize(raw)
                            raw_summaries = lexrank.probe(self.summa_count)
                            text3_summa = ""

                            for raw_summa in raw_summaries:
                                text3_summa += str(raw_summa) + ". "

                            if (text3_summa.find("기자]") != -1):
                                text3_summa = self.deleteNoise(text3_summa)
                            text["text3"] = text3_summa
                        else:
                            text["text3"] = ""
                        break

            if find:
                event = {
                    "start_date": {
                        "year": db_date.split('-')[0],
                        "month": db_date.split('-')[1],
                        "day": db_date.split('-')[2],
                        "hour": "",
                        "minute": "",
                        "second": "",
                        "millisecond": "",
                        "format": ""
                    },
                    "group": db_date.split('-')[1],
                    "media": {
                        "caption": "",
                        "credit": "",
                        "url": db_link,
                        "thumb": ""
                    }
                }
                event["text"] = text
                events.append(event)
                events_size += 1

            if (events_size == x_event):
                break

        return events
Example #25
from __future__ import print_function
from lexrankr import LexRank

# not this one

lexrank = LexRank()  # can init with various settings
lexrank.summarize('''
경찰이 잠실야구장에서 벌어진 '현대판 노예사건'에 대해 본격 수사에 착수했다. 앞서 서울시장애인인권센터는 이곳 분리수거장에서 A씨(60)를 구조하고, 그가 임금을 받지 못한 채 노예처럼 일해온 것으로 추정된다며 수사를 의뢰했다.

서울 송파경찰서는 A씨를 고용한 고물업체 사장 B씨를 최근 불구속 입건하고 인권센터가 제기한 사기·폭행·가혹행위 등의 의혹에 대해 수사 중이라고 20일 밝혔다.

인권센터와 경찰 등에 따르면 B씨는 지난 17년 동안 A씨에게 제대로 된 임금을 주지 않고 분리수거 업무를 시킨 의혹을 받는다. 

특히 인권센터는 A씨가 떨어진 빵으로 끼니를 채우면서 하루 16시간 동안 일하는 등 노동착취를 당했다고 주장했다. 지적장애를 가진 A씨는 분리수거장 내부 컨테이너에서 지금껏 생활해온 것으로 알려졌다.

현재 경찰은 인권센터와 언론에서 제기한 의혹에 대해 사실관계를 확인하는 중이다. 피해자와 고물업체 사장, 잠실야구장 시설관리자 등에 대해 한차례씩 조사도 진행했다. 

경찰관계자는 "피해자의 계좌 입출금 내역을 분석하고 있다"며 "학대 등의 혐의가 있는지 확인하기 위해 추가로 관계자들을 조사할 계획"이라고 밝혔다. 
''')
summaries = lexrank.probe(1)  # `num_summaries` can be `None` (using auto-detected topics)
for summary in summaries:  # just print
    print(summary)

from firebase import firebase
firebase = firebase.FirebaseApplication('https://chatbot-c6606.firebaseio.com')

result = firebase.post('/', {'news': summary})  # `summary` is the last probed sentence from the loop above
print(result)
Example #26
def simpleLexrank(arg):
    lexrank = LexRank()
    lexrank.summarize(arg)
    summaries = lexrank.probe(3)
    return summaries
Example #27
    def createJSON(self):

        start_time_createJSON = time.time()
        if (self.main_event > len(self.main_idx)):
            sys.exit("main_event must be smaller than main_idx")
        elif ((self.total_event - self.main_event) > len(self.outlier_idx)):
            sys.exit("total_event-main_event must be smaller than outlier_idx")

        lexrank = LexRank()  # can init with various settings
        curs = self.conn.cursor()

        ##############################################################################
        # MAKE TITLE #
        ##############################################################################
        main_outlier_text = ""

        main_outlier_idx = self.main_idx + self.outlier_idx
        main_outlier_idx_size = len(main_outlier_idx)

        tag = ""
        main_outlier_sentence_cnt = 0

        main_outlier_sql = "SELECT id,raw,summa,title,date,tag,sentence_cnt,link FROM " + self.table + " WHERE id in("

        for i in range(main_outlier_idx_size):
            if (i != (main_outlier_idx_size - 1)):
                main_outlier_sql += str(main_outlier_idx[i]) + ","
            else:
                main_outlier_sql += str(main_outlier_idx[i]) + ");"

        curs.execute(main_outlier_sql)
        main_outlier_rows = curs.fetchall()

        for row in main_outlier_rows:
            tag += "".join(row[5])
            main_outlier_sentence_cnt += row[6]
            if (row[2] is None):
                main_outlier_text += "".join(row[1])
            else:
                main_outlier_text += "".join(row[2])
        #main_outlier_tag=self.selectTag(tag)

        print("===============================================================")
        print("====================== START CREATE_JSON ======================")
        print("===============================================================\n")

        start_time = time.time()
        lexrank.summarize(main_outlier_text)
        print("first total : ")
        print("     %.02f sec" % (time.time() - start_time))

        summaries = lexrank.probe(self.total_event)

        find = 0
        db_link = ""
        main_outlier_summa = ""
        for summa in summaries:

            summa = str(summa)
            if (summa.find("기자]") != -1):
                summa = self.deleteNoise(summa)

            for row in main_outlier_rows:
                if (row[2] is None):
                    temp = "".join(row[1])
                    if (temp.find("".join(summa)) != -1):
                        db_link = str(row[7])
                        find += 1
                        main_outlier_summa += summa + ". "
                        break
                else:
                    temp = "".join(row[2])
                    if (temp.find("".join(summa)) != -1):

                        db_link = str(row[7])
                        find += 1
                        main_outlier_summa += summa + ". "
                        break

            if find == 1:
                break

        main_outlier_tag = self.selectTag(tag, main_outlier_summa)

        total_data = OrderedDict()

        total_data["scale"] = ""
        total_data["title"] = {
            "media": {
                "caption": "",
                "credit": "",
                "url": db_link,
                "thumb": ""
            },
            "text": {
                "headline": main_outlier_tag,
                "text": main_outlier_summa
            }
        }
        ##############################################################################
        # MAKE TITLE # END
        ##############################################################################

        ##############################################################################
        # MAKE EVENTS # START
        ##############################################################################
        events = []

        main_idx_size = len(self.main_idx)
        main_text = ""

        outlier_idx_size = len(self.outlier_idx)
        outlier_text = ""

        main_sql = "SELECT id,raw,summa,title,date,tag,sentence_cnt,link FROM " + self.table + " WHERE id in("
        for i in range(main_idx_size):
            if (i != (main_idx_size - 1)):
                main_sql += str(self.main_idx[i]) + ","
            else:
                main_sql += str(self.main_idx[i]) + ");"

        curs.execute(main_sql)
        main_rows = curs.fetchall()
        for row in main_rows:
            if (row[2] is None):
                main_text += "".join(row[1])
            else:
                main_text += "".join(row[2])

        start_time = time.time()
        lexrank.summarize(main_text)
        start_time = time.time()
        summaries = self.getDistinctArticleSummaries(lexrank, main_rows,
                                                     self.main_event)

        start_time = time.time()

        events += self.makeEvents(summaries, main_rows, self.main_event)

        outlier_sql = "SELECT id,raw,summa,title,date,tag,sentence_cnt,link FROM " + self.table + " WHERE id in("
        for i in range(outlier_idx_size):
            if (i != (outlier_idx_size - 1)):
                outlier_sql += str(self.outlier_idx[i]) + ","
            else:
                outlier_sql += str(self.outlier_idx[i]) + ");"

        curs.execute(outlier_sql)
        outlier_rows = curs.fetchall()
        for row in outlier_rows:
            if (row[2] is None):
                outlier_text += "".join(row[1])
            else:
                outlier_text += "".join(row[2])

        start_time = time.time()

        lexrank.summarize(outlier_text)

        summaries = self.getDistinctArticleSummaries(
            lexrank, outlier_rows, self.total_event - self.main_event)

        events += self.makeEvents(summaries, outlier_rows,
                                  self.total_event - self.main_event)

        total_data["events"] = events

        print("     <TOTAL TIME> ")
        print("     %.02f sec" % (time.time() - start_time_createJSON) + "\n")
        ##############################################################################
        # MAKE EVENTS # END
        ##############################################################################

        numTag = 3
        fileTag = ""
        for i in range(0, numTag):
            fileTag += "".join(main_outlier_tag.split(' ')[i])
            if (i != numTag - 1):
                fileTag += "-"

        file_path = "JSON/" + fileTag + "_" + str(self.total_event) + ".json"
        with codecs.open(file_path, 'w', encoding="utf-8") as make_file:
            json.dump(total_data,
                      make_file,
                      ensure_ascii=False,
                      indent=4,
                      sort_keys=False)
Example #28
File: test.py  Project: DSLDataStorage/KETI
    def setUp(self):
        self.text = "사과 배 감 귤. 배 감 귤 수박. 감 귤 수박 딸기. 오이 참외 오징어. 참외 오징어 달팽이."
        self.lexrank = LexRank(min_keyword_length=0, no_below_word_count=0, min_cluster_size=1)
Example #29
from __future__ import print_function
from lexrankr import LexRank
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
import sys

filePath = sys.argv[1]
probNumber = sys.argv[2]

lexrank = LexRank()

all_txt = []
with open(filePath, "r", encoding="UTF-8") as f:
    all_txt = f.readlines()

all_string = ""
for i in all_txt:
    all_string += i
with open("sentence_set.txt", "r") as f:
    sent_analy = f.readlines()
    for i in range(len(sent_analy)):
        sent_analy[i] = sent_analy[i].split(' ')
        for j in range(len(sent_analy[i])):
            if '\n' in sent_analy[i][j]:
                sent_analy[i][j] = sent_analy[i][j].replace('\n', '')

token_all_string = pos_tag(word_tokenize(all_string))

process_str = ""
index = 0
while (len(token_all_string)):
Example #30
from flask import Flask, request, jsonify
from flask_restful import Resource, Api
from flask_restful import reqparse
from lexrankr import LexRank
from time import time

app = Flask(__name__)
api = Api(app)
app.config['JSON_AS_ASCII'] = False

lexrank = LexRank()  # can init with various settings


@app.route('/summary', methods=['POST'])
def post():
    try:
        start = time()
        parser = reqparse.RequestParser()
        parser.add_argument('contents', type=str)
        args = parser.parse_args()

        contents = args['contents']
        print("==== contents ==== : ", contents)

        print(contents)

        # lexrank.summarize(contents)
        # originalText = "이 업체 물건 절대 사지마세요. 형편없습니다. 직원들이 고객은 안중에도 없습니다. 열 받게 만드는 데 선수입니다. 박지원 직원인지 대표인지 몰라도 열받게 하는 제주가 보통이 넘습니다. 다시 한 번 강조하지만 절대 네버네버 스위트피 사지 마세요. 절대 비추입니다."
        lex = LexRank()
        lex.summarize(contents)