Example #1
def main():
    #   Arguments  #
    parser = argparse.ArgumentParser(
        description='Pengtai Instagram RNN LSTM Model')
    parser.add_argument(
        '-t',
        '--type',
        type=str,
        help="run type Options: 'n' for new | 'o' for overwrite",
        default='o',
        nargs='+')
    # parser.add_argument('-d', '--dest_dir', type=str, help='CSV data file')
    parser.add_argument('-i',
                        '--input_dir',
                        type=str,
                        help='Input Raw CSV directory')
    parser.add_argument('-u', '--user_id', type=str, help='Instagram User ID')
    parser.add_argument('-v',
                        '--version',
                        help='current version',
                        action='store_true')

    args = parser.parse_args()
    #  End Argparse #

    # VERSION CONTROL #
    if args.version:
        with open(settings.VERSION_JSON, "r") as jsonFile:
            data = json.load(jsonFile)

        return print(data['version'])

    if args.type:
        if args.type[0] == 'n' and args.type[1]:
            with open(settings.VERSION_JSON, "r") as jsonFile:
                data = json.load(jsonFile)

            data["version"] = args.type[1]

            with open(settings.VERSION_JSON, "w") as jsonFile:
                json.dump(data, jsonFile)

            VERSION = args.type[1]

        elif args.type[0] == 'o':
            with open(settings.VERSION_JSON, "r") as jsonFile:
                data = json.load(jsonFile)

            VERSION = data["version"]

    # End VERSION CONTROL #

    with open('./dic/polarity.csv', 'r', encoding='UTF-8') as file:
        csvreader = csv.DictReader(file)
        kosac = [row for row in csvreader]

    total_arr = []
    rowI = 0
    rowDict = {}

    # File List in the directory from the arguments
    for filename in glob.glob(os.path.join(args.input_dir, '*.csv')):
        # i = ['id', 'img', 'text', 'has_tag', 'write_date', 'reg_date']
        with open(filename, 'r', encoding='UTF-8') as f:
            csvreader = csv.DictReader(f)
            # csvreader = csv.reader(f)
            for row in csvreader:
                if rowI == 0:
                    rowDict = {"user_id": row['user_id'], "posts": []}
                else:
                    # print(user_id, row['user_id'], rowDict)
                    if rowDict['user_id'] != row['user_id']:
                        total_arr.append(rowDict)
                        rowDict = {"user_id": row['user_id'], "posts": []}

                # text preprocess
                text = re.sub(r'@\w+', '', row['text'])
                text = re.sub(
                    'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
                    '', text)
                text = re.sub(r'[\[]|[\]]', '', text)
                text = re.sub(r'[\r]|[\n]', ' ', text)
                text = re.sub(r'[.]|[ㆍ]', '', text)
                text = re.sub(r'#', ' ', text)

                rowDict['posts'].append({
                    "datetime": row['write_date'],
                    "text": text
                })
                rowI = rowI + 1

    # print(total_arr)
    trg_res = [item for item in total_arr if item["user_id"] == args.user_id]
    temp = []
    kkma = Kkma()
    t = Twitter()

    for post in trg_res[0]['posts']:
        date = datetime.datetime(int(post['datetime'][0:4]),
                                 int(post['datetime'][5:7]),
                                 int(post['datetime'][8:10]),
                                 int(post['datetime'][11:13]),
                                 int(post['datetime'][14:16]),
                                 int(post['datetime'][17:19]))
        text = post['text']
        temp.append((date, text))

    temp = sorted(temp, key=lambda t: t[0], reverse=False)

    sentArr = []
    newArr = []
    tokens_ko = []
    index = 0
    nounsArr = []

    for data in temp:
        sentPosArr = kkma.pos(data[1])
        # sentNouns = kkma.nouns(data[1])

        inArr = []
        for outA in sentPosArr:
            # for inA in outA:
            inArr.append("/".join(outA))

        morph_arr = t.morphs(data[1])
        morphWords = [word for word in morph_arr if not word in tokens_ko]
        for word in morphWords:
            if not word in nounsArr:
                nounsArr.append(word)

        tokens_ko.extend(morphWords)

        newArr.append({"sentence": "", "words": morph_arr, "score": 0})

        index = index + 1
        sentArr.append(";".join(inArr))

    index = 0
    for eaSent in sentArr:
        sentiScore = 0
        for corp in kosac:
            if eaSent.find(corp['ngram']) > -1:
                if corp['max.value'] == 'NEG':
                    sentiScore = sentiScore - float(corp['max.prop'])
                elif corp['max.value'] == 'POS':
                    sentiScore = sentiScore + float(corp['max.prop'])

        newArr[index]["sentence"] = eaSent
        newArr[index]["score"] = sentiScore

        index = index + 1

    # ACO algorithm

    # doc_ko = " ".join([row[1] for row in temp])
    # text_arr = [row[1] for row in temp]
    # for text in text_arr:
    #     morph_arr = t.morphs(text)
    #     temp = [word for word in morph_arr if not word in tokens_ko]
    #     tokens_ko.extend(temp)

    print(tokens_ko)
    ko = nltk.Text(tokens_ko)  # For Python 2, pass the input as a Unicode literal (u'...')

    # # print(len(set(ko.tokens)))  # returns number of unique tokens
    vocab = dict([(item[0], index + 1)
                  for index, item in enumerate(ko.vocab().items())])
    # pprint(vocab)  # returns number of tokens (document length)
    minTimeVal = int(temp[0][0].timestamp())
    maxTimeVal = int(temp[len(temp) - 1][0].timestamp() - minTimeVal)

    tenPow = len(str(int(temp[len(temp) - 1][0].timestamp() - minTimeVal)))
    tenPow = pow(10, tenPow)

    index = 0
    nodes = []

    for data in temp:
        # print(data[0].utctimetuple)
        # print(data[0].time())
        diffTimeVal = int(data[0].timestamp() - minTimeVal)

        opt1 = float(diffTimeVal / tenPow)
        opt2 = float(diffTimeVal / maxTimeVal)
        print(diffTimeVal, opt1, opt2)

        nodes.append((opt2, newArr[index]["words"]))
        index = index + 1

    # print(nounsArr)
    nodes2 = []
    for noun in nounsArr:
        for corp in kosac:
            hts = "%s/NNG" % (noun)
            if hts.find(corp['ngram']) > -1:
                if corp['max.value'] == 'NEG':
                    nodes2.append({
                        "noun": noun,
                        "score": -float(corp['max.prop'])
                    })
                elif corp['max.value'] == 'POS':
                    nodes2.append({
                        "noun": noun,
                        "score": float(corp['max.prop'])
                    })

    print()
    antCount = len(newArr)
    rhoVal = 0.3

    # ACO algorithm example
    # nodes = []
    # for _ in range(20):
    #     x = random.uniform(-10, 10)
    #     y = random.uniform(-10, 10)
    #     nodes.append((x, y))
    #
    def euclidean(a, b):
        return math.sqrt(pow(a[1] - b[1], 2) + pow(a[0] - b[0], 2))

    #
    world = pants.World(nodes, euclidean)
    #
    solver = pants.Solver(rho=rhoVal)
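The excerpt stops right after the solver is built; the following self-contained sketch, assuming the ACO-Pants package that the pants.World / pants.Solver calls above come from, shows what a typical solver run looks like, using illustrative random 2-D nodes rather than the Instagram-derived nodes above.

import math
import random

import pants

# illustrative 2-D coordinates, mirroring the commented-out example above
demo_nodes = [(random.uniform(-10, 10), random.uniform(-10, 10)) for _ in range(20)]


def demo_euclidean(a, b):
    # Euclidean distance between two (x, y) points
    return math.sqrt(pow(a[0] - b[0], 2) + pow(a[1] - b[1], 2))


demo_world = pants.World(demo_nodes, demo_euclidean)
demo_solver = pants.Solver(rho=0.3)
demo_solution = demo_solver.solve(demo_world)
print(demo_solution.distance)  # total length of the best tour found
print(demo_solution.tour)      # node visiting order of the best tour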
Example #2
def tkorean_instance():
    from konlpy.tag import Twitter
    t = Twitter()
    return t
Example #3
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV, RandomizedSearchCV
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier

from scipy.stats import randint as sp_randint

train_df = pd.read_csv("dataset/old_binary_train_data.csv", encoding='CP949')

corpus_data = train_df['QAContent_1'].values
corpus_lable = train_df['label'].values

from konlpy.tag import Twitter

mecab = Twitter()


def get_normalized_data(sentence):
    # original_sentence = mecab.pos(sentence, norm=True, stem=True)
    original_sentence = mecab.pos(sentence, norm=True)
    inputData = []
    for w, t in original_sentence:
        if t not in ['Number', 'Punctuation', 'KoreanParticle']:
            inputData.append(w)
    return (' '.join(inputData)).strip()


corpus = [get_normalized_data(i) for i in corpus_data]

stopWords = []
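A minimal sketch of how the normalized corpus above might feed the imported vectorizer, splitter, and one of the imported classifiers; the hyperparameter values are illustrative, not taken from the original code.

vectorizer = TfidfVectorizer(stop_words=stopWords or None, ngram_range=(1, 2), max_features=5000)
X = vectorizer.fit_transform(corpus)

X_train, X_test, y_train, y_test = train_test_split(
    X, corpus_lable, test_size=0.2, random_state=42, stratify=corpus_lable)

clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))  # held-out accuracy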
Example #4
from keras.layers.core import Dense
from keras.layers.embeddings import Embedding
from keras.layers import LSTM
from keras.datasets import imdb

TRAIN_FILENAME = 'ratings_train.txt'
TRAIN_DATA_FILENAME = TRAIN_FILENAME + '.data'
TEST_FILENAME = 'ratings_test.txt'
TEST_DATA_FILENAME = TEST_FILENAME + '.data'

#max_features = 55826
max_features = 56000
maxlen = 100  # cut texts after this number of words
batch_size = 32

pos_tagger = Twitter()
vocab = dict()


def __FUNC__():
    return traceback.extract_stack(None, 2)[0][2]


def tokenize(doc):
    return ['/'.join(t) for t in pos_tagger.pos(doc, norm=True, stem=True)]


def read_raw_data(filename, debug=False):
    with open(filename, 'r', encoding='utf-8') as f:
        print('loading data')
        data = [line.split('\t') for line in f.read().splitlines()]
Example #5
from collections import Counter
from konlpy.tag import Twitter
t = Twitter()
from konlpy.corpus import kolaw
from types import *
import gensim
from gensim.models import LdaModel
from gensim import corpora, models
import MySQLdb
import operator
import decimal
import math
db = MySQLdb.connect(host="localhost",
                     user="******",
                     passwd="kkms1234",
                     db="scraping",
                     charset='utf8')

cursor = db.cursor(MySQLdb.cursors.DictCursor)
cursor2 = db.cursor(MySQLdb.cursors.DictCursor)
cursor3 = db.cursor(MySQLdb.cursors.DictCursor)
cursor4 = db.cursor(MySQLdb.cursors.DictCursor)
cursor5 = db.cursor(MySQLdb.cursors.DictCursor)
cursor6 = db.cursor(MySQLdb.cursors.DictCursor)
cursor7 = db.cursor(MySQLdb.cursors.DictCursor)
cursor8 = db.cursor(MySQLdb.cursors.DictCursor)
cursor9 = db.cursor(MySQLdb.cursors.DictCursor)
cursor10 = db.cursor(MySQLdb.cursors.DictCursor)

cursor.execute("set names utf8")
cursor2.execute("set names utf8")
Example #6
texts = open(file_name, "r", encoding='euc-kr')

# In[7]:

get_ipython().system('pip install git+https://github.com/ssut/py-hanspell.git')
get_ipython().system('pip install konlpy')
get_ipython().system('pip install krwordrank')

from hanspell import spell_checker
from tqdm.notebook import tqdm
from konlpy.tag import Twitter
from collections import Counter
from krwordrank.hangle import normalize

nlpy = Twitter()

lines = [line.rstrip('\n') for line in texts]  # split the txt file on newline characters

nouns_word = []  # extracted noun words
normalized_lines = []
for each_line in tqdm(lines):
    each_line = each_line.replace("\x0c", "")  # remove special characters introduced when loading the JSON
    each_line = normalize(each_line, english=True, number=True)  # remove special characters
    each_line = spell_checker.check(each_line).checked  # fix spelling mistakes, if any
    nouns_word = nouns_word + nlpy.nouns(each_line)  # extract nouns
    normalized_lines.append(each_line)

# In[8]:

normalized_lines
Example #7
 def tokenize(self, doc):
     pos_tagger = Twitter()
     return ['/'.join(t) for t in pos_tagger.pos(doc, norm=True, stem=True)]
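A minimal standalone usage sketch of the same tokenization pattern; the sample sentence is illustrative and `from konlpy.tag import Twitter` is assumed to be in scope.

pos_tagger = Twitter()
tokens = ['/'.join(t) for t in pos_tagger.pos("친구가 집에 놀러왔다", norm=True, stem=True)]
print(tokens)  # each token is formatted as "morpheme/POS", e.g. '친구/Noun'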
Example #8
 def __init__(self):
     self.kkma = Kkma()
     self.twitter = Twitter()
     self.stopwords = [
         '중인', '만큼', '마찬가지', '꼬집었', "연합뉴스", "데일리", "동아일보", "중앙일보", "조선일보",
         "기자", "으로", "로", "에게", "뿐이다", "의거하여", "근거하여", "입각하여", "기준으로",
         "예하면", "예를", "들면", "들자면", "저", "소인", "소생", "저희", "지말고", "하지마",
         "하지마라", "다른", "물론", "또한", "그리고", "비길수", "없다", "해서는", "안된다", "뿐만",
         "아니라", "만이", "아니다", "만은", "아니다", "막론하고", "관계없이", "그치지", "않다",
         "그러나", "그런데", "하지만", "든간에", "논하지", "않다", "따지지", "않다", "설사", "비록",
         "더라도", "아니면", "만", "못하다", "하는", "편이", "낫다", "불문하고", "향하여", "향해서",
         "향하다", "쪽으로", "틈타", "이용하여", "타다", "오르다", "제외하고", "이", "외에", "이",
         "밖에", "하여야", "비로소", "한다면", "몰라도", "외에도", "이곳", "여기", "부터", "기점으로",
         "따라서", "할", "생각이다", "하려고하다", "이리하여", "그리하여", "그렇게", "함으로써", "하지만",
         "일때", "할때", "앞에서", "중에서", "보는데서", "으로써", "로써", "까지", "해야한다",
         "일것이다", "반드시", "할줄알다", "할수있다", "할수있어", "임에", "틀림없다", "한다면", "등",
         "등등", "제", "겨우", "단지", "다만", "할뿐", "딩동", "댕그", "대해서", "대하여", "대하면",
         "훨씬", "얼마나", "얼마만큼", "얼마큼", "남짓", "여", "얼마간", "약간", "다소", "좀",
         "조금", "다수", "몇", "얼마", "지만", "하물며", "또한", "그러나", "그렇지만", "하지만",
         "이외에도", "대해", "말하자면", "뿐이다", "다음에", "반대로", "반대로", "말하자면", "이와",
         "반대로", "바꾸어서", "말하면", "바꾸어서", "한다면", "만약", "그렇지않으면", "까악", "툭",
         "딱", "삐걱거리다", "보드득", "비걱거리다", "꽈당", "응당", "해야한다", "에", "가서", "각",
         "각각", "여러분", "각종", "각자", "제각기", "하도록하다", "와", "과", "그러므로", "그래서",
         "고로", "한", "까닭에", "하기", "때문에", "거니와", "이지만", "대하여", "관하여", "관한",
         "과연", "실로", "아니나다를가", "생각한대로", "진짜로", "한적이있다", "하곤하였다", "하", "하하",
         "허허", "아하", "거바", "와", "오", "왜", "어째서", "무엇때문에", "어찌", "하겠는가",
         "무슨", "어디", "어느곳", "더군다나", "하물며", "더욱이는", "어느때", "언제", "야", "이봐",
         "어이", "여보시오", "흐흐", "흥", "휴", "헉헉", "헐떡헐떡", "영차", "여차", "어기여차",
         "끙끙", "아야", "앗", "아야", "콸콸", "졸졸", "좍좍", "뚝뚝", "주룩주룩", "솨", "우르르",
         "그래도", "또", "그리고", "바꾸어말하면", "바꾸어말하자면", "혹은", "혹시", "답다", "및",
         "그에", "따르는", "때가", "되어", "즉", "지든지", "설령", "가령", "하더라도", "할지라도",
         "일지라도", "지든지", "몇", "거의", "하마터면", "인젠", "이젠", "된바에야", "된이상",
         "만큼 어찌됏든", "그위에", "게다가", "점에서", "보아", "비추어", "보아", "고려하면",
         "하게될것이다", "일것이다", "비교적", "좀", "보다더", "비하면", "시키다", "하게하다", "할만하다",
         "의해서", "연이서", "이어서", "잇따라", "뒤따라", "뒤이어", "결국", "의지하여", "기대여",
         "통하여", "자마자", "더욱더", "불구하고", "얼마든지", "마음대로", "주저하지", "않고", "곧",
         "즉시", "바로", "당장", "하자마자", "밖에", "안된다", "하면된다", "그래", "그렇지", "요컨대",
         "다시", "말하자면", "바꿔", "말하면", "즉", "구체적으로", "말하자면", "시작하여", "시초에",
         "이상", "허", "헉", "허걱", "바와같이", "해도좋다", "해도된다", "게다가", "더구나", "하물며",
         "와르르", "팍", "퍽", "펄렁", "동안", "이래", "하고있었다", "이었다", "에서", "로부터",
         "까지", "예하면", "했어요", "해요", "함께", "같이", "더불어", "마저", "마저도", "양자",
         "모두", "습니다", "가까스로", "하려고하다", "즈음하여", "다른", "다른", "방면으로", "해봐요",
         "습니까", "했어요", "말할것도", "없고", "무릎쓰고", "개의치않고", "하는것만", "못하다", "하는것이",
         "낫다", "매", "매번", "들", "모", "어느것", "어느", "로써", "갖고말하자면", "어디",
         "어느쪽", "어느것", "어느해", "어느", "년도", "라", "해도", "언젠가", "어떤것", "어느것",
         "저기", "저쪽", "저것", "그때", "그럼", "그러면", "요만한걸", "그래", "그때", "저것만큼",
         "그저", "이르기까지", "할", "줄", "안다", "할", "힘이", "있다", "너", "너희", "당신",
         "어찌", "설마", "차라리", "할지언정", "할지라도", "할망정", "할지언정", "구토하다", "게우다",
         "토하다", "메쓰겁다", "옆사람", "퉤", "쳇", "의거하여", "근거하여", "의해", "따라", "힘입어",
         "그", "다음", "버금", "두번째로", "기타", "첫번째로", "나머지는", "그중에서", "견지에서",
         "형식으로", "쓰여", "입장에서", "위해서", "단지", "의해되다", "하도록시키다", "뿐만아니라",
         "반대로", "전후", "전자", "앞의것", "잠시", "잠깐", "하면서", "그렇지만", "다음에", "그러한즉",
         "그런즉", "남들", "아무거나", "어찌하든지", "같다", "비슷하다", "예컨대", "이럴정도로", "어떻게",
         "만약", "만일", "위에서", "서술한바와같이", "인", "듯하다", "하지", "않는다면", "만약에",
         "무엇", "무슨", "어느", "어떤", "아래윗", "조차", "한데", "그럼에도", "불구하고", "여전히",
         "심지어", "까지도", "조차도", "하지", "않도록", "않기", "위하여", "때", "시각", "무렵",
         "시간", "동안", "어때", "어떠한", "하여금", "네", "예", "우선", "누구", "누가", "알겠는가",
         "아무도", "줄은모른다", "줄은", "몰랏다", "하는", "김에", "겸사겸사", "하는바", "그런",
         "까닭에", "한", "이유는", "그러니", "그러니까", "때문에", "그", "너희", "그들", "너희들",
         "타인", "것", "것들", "너", "위하여", "공동으로", "동시에", "하기", "위하여", "어찌하여",
         "무엇때문에", "붕붕", "윙윙", "나", "우리", "엉엉", "휘익", "윙윙", "오호", "아하",
         "어쨋든", "만", "못하다    하기보다는", "차라리", "하는", "편이", "낫다", "흐흐", "놀라다",
         "상대적으로", "말하자면", "마치", "아니라면", "쉿", "그렇지", "않으면", "그렇지", "않다면",
         "안", "그러면", "아니었다면", "하든지", "아니면", "이라면", "좋아", "알았어", "하는것도",
         "그만이다", "어쩔수", "없다", "하나", "일", "일반적으로", "일단", "한켠으로는", "오자마자",
         "이렇게되면", "이와같다면", "전부", "한마디", "한항목", "근거로", "하기에", "아울러", "하지",
         "않도록", "않기", "위해서", "이르기까지", "이", "되다", "로", "인하여", "까닭으로",
         "이유만으로", "이로", "인하여", "그래서", "이", "때문에", "그러므로", "그런", "까닭에", "알",
         "수", "있다", "결론을", "낼", "수", "있다", "으로", "인하여", "있다", "어떤것", "관계가",
         "있다", "관련이", "있다", "연관되다", "어떤것들", "에", "대해", "이리하여", "그리하여", "여부",
         "하기보다는", "하느니", "하면", "할수록", "운운", "이러이러하다", "하구나", "하도다", "다시말하면",
         "다음으로", "에", "있다", "에", "달려", "있다", "우리", "우리들", "오히려", "하기는한데",
         "어떻게", "어떻해", "어찌됏어", "어때", "어째서", "본대로", "자", "이", "이쪽", "여기",
         "이것", "이번", "이렇게말하자면", "이런", "이러한", "이와", "같은", "요만큼", "요만한", "것",
         "얼마", "안", "되는", "것", "이만큼", "이", "정도의", "이렇게", "많은", "것", "이와",
         "같다", "이때", "이렇구나", "것과", "같이", "끼익", "삐걱", "따위", "와", "같은", "사람들",
         "부류의", "사람들", "왜냐하면", "중의하나", "오직", "오로지", "에", "한하다", "하기만", "하면",
         "도착하다", "까지", "미치다", "도달하다", "정도에", "이르다", "할", "지경이다", "결과에",
         "이르다", "관해서는", "여러분", "하고", "있다", "한", "후", "혼자", "자기", "자기집",
         "자신", "우에", "종합한것과같이", "총적으로", "보면", "총적으로", "말하면", "총적으로", "대로",
         "하다", "으로서", "참", "그만이다", "할", "따름이다", "쿵", "탕탕", "쾅쾅", "둥둥", "봐",
         "봐라", "아이야", "아니", "와아", "응", "아이", "참나", "년", "월", "일", "령", "영",
         "일", "이", "삼", "사", "오", "육", "륙", "칠", "팔", "구", "이천육", "이천칠",
         "이천팔", "이천구", "하나", "둘", "셋", "넷", "다섯", "여섯", "일곱", "여덟", "아홉",
         "령", "영"
     ]
Example #9
# Using the pos() method of the Twitter object
# pos(sentence to analyze, norm option, stem option)

from konlpy.tag import Twitter

twitter = Twitter()  # create a Twitter tagger object
wordList = twitter.pos("친구가 집에 놀러왔다",norm=True,stem=True)
print(wordList)


# Hannanum analyzer
from konlpy.tag import Hannanum
hannanum = Hannanum()

# The analyze() method's output is hard to read!
wordList = hannanum.analyze(u'롯데마트의 흑마늘 양념 치킨이 논란이 되고 있다')
print(wordList)

# morphs() outputs the segmented morphemes only, with no POS info
wordList = hannanum.morphs(u'롯데마트의 흑마늘 양념 치킨이 논란이 되고 있다')
print(wordList)

nounList = hannanum.nouns(u'다람쥐는 새 쳇바퀴에 타고 싶다')
print(hannanum.pos(u'웃으면 더 행복합니다'))
Example #10
def get_tags(text, ntags=30, multiplier=2):
    t = Twitter()
    nouns = []
Example #11
def count_wordfreq(data):  # morphological analysis helper
    twitter = Twitter()
    nouns = twitter.nouns(data)  # extract only the nouns from data

    count = Counter(nouns)  # dict-like counter of noun frequencies
    return count  # return the frequency counts
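A minimal usage sketch for count_wordfreq; the sentence is illustrative, and the Counter / konlpy Twitter imports the excerpt relies on are assumed to be in scope.

freq = count_wordfreq("아버지가 방에 들어가신다. 방은 넓고 밝다.")
print(freq.most_common(3))  # the three most frequent nouns with their counts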
Example #12
import sys, codecs

fileNames = sys.argv

from collections import Counter
from konlpy.tag import Twitter

keywords = []

k = open("dumb.txt")
datak = k.read()
nlpk = Twitter()
dumb_file = nlpk.nouns(datak)


def keywords_extract(filename):
    f = codecs.open('./reviews/' + filename, "r", "utf-8")
    data = f.read()

    nlp = Twitter()
    nouns = nlp.nouns(data)

    # filter out nouns that also appear in the dumb.txt word list
    nouns = [i for i in nouns if i not in dumb_file]

    count = Counter(nouns)
    words = count.most_common(40)

    keyword = words[0:3]
Example #13
from collections import Counter
from konlpy.tag import Twitter
t = Twitter()
from konlpy.corpus import kolaw
from types import *
import gensim
from gensim.models import LdaModel
from gensim import corpora, models
import MySQLdb
import operator
import decimal
db = MySQLdb.connect(host="localhost",
                     user="******",
                     passwd="kkms1234",
                     db="scraping",
                     charset='utf8')

cursor = db.cursor(MySQLdb.cursors.DictCursor)
cursor2 = db.cursor(MySQLdb.cursors.DictCursor)
cursor3 = db.cursor(MySQLdb.cursors.DictCursor)
cursor4 = db.cursor(MySQLdb.cursors.DictCursor)
cursor5 = db.cursor(MySQLdb.cursors.DictCursor)
cursor6 = db.cursor(MySQLdb.cursors.DictCursor)
cursor7 = db.cursor(MySQLdb.cursors.DictCursor)
cursor8 = db.cursor(MySQLdb.cursors.DictCursor)
cursor9 = db.cursor(MySQLdb.cursors.DictCursor)
cursor10 = db.cursor(MySQLdb.cursors.DictCursor)

cursor.execute("set names utf8")
cursor2.execute("set names utf8")
cursor3.execute("set names utf8")
Example #14
    while start <= page * 15:
        url = "https://m.search.naver.com/search.naver?where=m_blog&query=" + keyword + "&start=%d" % (
            start)  # blog

        source_code = requests.get(url)
        soup = BeautifulSoup(source_code.text, "lxml")
        for link in soup.find_all("a", class_="total_wrap"):
            temp = link.get("href")
            if "http://m.blog.naver.com" in temp:
                list1.append(temp)
        start += 15
    datastring = ''
print(len(list1))

nlp = Twitter()  # use the Twitter (konlpy) tagger
list1 = list(set(list1))  # crude way to remove duplicate URLs...
negative = ('아니다', '절대', '검색', '그냥', '듯', '같다', '대부분', '어디서', '그렇다', '전혀')
regex = r'[가-힣, \s ]+'
data = open('reviews_rawdata_4.txt', 'w', encoding='UTF-8')
for url in list1:
    try:
        datastring = ''
        source_code = requests.get(url, timeout=5)
        soup = BeautifulSoup(source_code.text, "lxml")

        # post title tag info
        if soup.find_all("h3", class_="tit_h3"):
            title = soup.find_all("h3", class_="tit_h3")[0]
        else:
            title = soup.find_all("h3", class_="se_textarea")[0]
Example #15
	def preprocess(self):
		"""
            Preprocessing for ACO
        """
		kkma = Kkma()
		t = Twitter()
		newArr = []
		sentArr = []
		nounsArr = []
		tokens_ko = []
		index = 0

		self.resultArr = sorted(self.resultArr, key=lambda t: t[0], reverse=False)

		for data in self.resultArr:
			# text preprocess
			text = re.sub(r'@\w+', '', data[1])
			text = re.sub(
				'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '',
				text)
			text = re.sub(r'[\[]|[\]]', '', text)
			text = re.sub(r'[\r]|[\n]', ' ', text)
			text = re.sub(r'[.]|[ㆍ]', '', text)
			text = re.sub(r'#', ' ', text)

			# data[0] = datetime.datetime(int(data[0][0:4]), int(data[0][5:7]),
			# 									  int(data[0][8:10]),
			# 									  int(data[0][11:13]), int(data[0][14:16]),
			# 									  int(data[0][17:19]))

			sentPosArr = kkma.pos(text)
			inArr = []
			for outA in sentPosArr:
				# for inA in outA:
				inArr.append("/".join(outA))

			morph_arr = t.morphs(text)
			morphWords = [word for word in morph_arr if not word in tokens_ko]
			for word in morphWords:
				if not word in nounsArr:
					nounsArr.append(word)

			tokens_ko.extend(morphWords)

			newArr.append({"date":data[0],"sentence": "", "words": morph_arr, "score": 0})

			index = index + 1
			sentArr.append(";".join(inArr))

		index = 0
		for eaSent in sentArr:
			sentiScore = 0
			for corp in settings.KOSAC:
				if eaSent.find(corp['ngram']) > -1:
					if corp['max.value'] == 'NEG':
						sentiScore = sentiScore - float(corp['max.prop'])
					elif corp['max.value'] == 'POS':
						sentiScore = sentiScore + float(corp['max.prop'])

			newArr[index]["sentence"] = self.resultArr[index][1]
			newArr[index]["score"] = sentiScore

			index = index + 1

		self.resultArr2 = newArr
Example #16
# import sys
# from pprint import pprint
import re
import math, random, csv, networkx as nx, operator
from nltk.collocations import BigramCollocationFinder

# from gensim.summarization import summarize
# from gensim.summarization import keywords
# from gensim.summarization.textcleaner import split_sentences

import requests
from bs4 import BeautifulSoup

# pos tagging, tokenizing
from konlpy.tag import Twitter
tagger = Twitter()
from collections import Counter

import asyncio
"""
page rank class
"""


class textRank:
    def getNews(self, url):
        self.url = url
        text = requests.get(self.url)
        # print('&&&&&&&', len(text.text))
        soup = BeautifulSoup(text.text, 'html.parser')
        # save news title
Example #17
	def get(self, request, format=None):
		serializer = AnalyzerSerializer(data=request.query_params)

		if serializer.is_valid():
			formData = serializer.validated_data
			# work in progress 2017.11.15
			# ############################ 			CRAWL			###################################
			# session = request.session.load()

			# Setting
			# setting_path = '{}settings.json'.format(settings.CRAWL_PROJ_PATH)
			# authentication = '{}auth.json'.format(settings.CRAWL_PROJ_PATH)
            #
			# cmd_arr = [settings.GO_CRAWL_CMD, settings.GO_CRAWL_IN_PATH,
			# 		   '-n=' + str(5)]
            #
			# cmd_arr.append('-q={}'.format(formData['instaId']))
			# cmd_arr.append('-l')

			# # subprocess.call(cmd_arr)
			# # try:
			# temp = call(cmd_arr)
			# print(temp)

			temp = self.initializeC(formData['instaId'], 5)

			############################ 			ML			###################################
			# with open("new_data.json", "r") as jf:
			# 	dt = json.load(jf)

			text = formData['text']

			x_arr = []

			t = Twitter()
			vocab_fn = settings.VOCAB_FILENAME.format(settings.ML_VERSION)
			vocab_file = os.path.join(settings.DATA_DIR, vocab_fn)
			jobj = json.loads((open(vocab_file).read()))

			arr = list()
			tokens_ko = t.morphs(text)

			for word in tokens_ko:
				try:
					tmp = jobj[word]
					arr.append(tmp)
				except KeyError:
					pass

			temp_arr = np.asarray(arr)
			x_arr.append(temp_arr)

			x_test = np.asarray(x_arr, dtype=object)

			print('Pad sequences (samples x time)')
			x_test = sequence.pad_sequences(x_test, maxlen=settings.MAX_LENGTH)
			print('x_test shape:', x_test.shape)

			with graph.as_default():
				classes = model.predict(x_test, batch_size=settings.BATCH_SIZE)
				serializer.save()
				return Response({ "lstm":[serializer.data, classes],"aco":temp }, status=status.HTTP_201_CREATED)

		return Response(serializer.errors, status=status.HTTP_400_BAD_REQUEST)

# class CrawlView(APIView):
# 	parser_classes = (JSONParser,)
# 	authentication_classes = (SessionAuthentication, BasicAuthentication)
# 	permission_classes = (IsAuthenticated,)
#
# 	def get(self, request, format=None):
# 		# print(request.user)
#
# 		crawls = Crawl.objects.all()
# 		serializer = CrawlSerializer(crawls, many=True)
# 		return Response(serializer.data)

# class CrawlSaveView(APIView):
# 	parser_classes = (JSONParser,)
# 	authentication_classes = (SessionAuthentication, BasicAuthentication)
# 	permission_classes = (IsAuthenticated,)
#
# 	def get(self, request, format=None):
#
# 		serializer = CrawlSerializer(data=request.query_params)
#
# 		if serializer.is_valid():
#
# 			# run the crawl
# 			data = serializer.validated_data
#
# 			now = datetime.datetime.now()
# 			# Setting
# 			setting = settings.CRAWL_SETTING
# 			csv_dir_prefix = '{}data'.format(settings.CRAWL_PROJ_PATH)
# 			setting_path = '{}settings.json'.format(settings.CRAWL_PROJ_PATH)
# 			authentication = '{}auth.json'.format(settings.CRAWL_PROJ_PATH)
#
# 			GO_CRAWL_PATH = settings.GO_CRAWL_FB_PATH if data.get('sns_kind') == 'fb' else settings.GO_CRAWL_IN_PATH
#
# 			DB_CURRENT_CNT = 0
#
# 			loop_cnt = int(data.get('number') / 500)
#
# 			# img directory check
# 			img_dir_path = os.path.join(settings.CRAWL_PROJ_PATH, 'img')
# 			if not os.path.exists(img_dir_path):
# 				os.makedirs(img_dir_path)
#
# 			# !! CHANGE FROM DB CONNECTION TO FILE SYSTEM !!
# 			DB_CNT = 0
# 			csv_filename = "{}-explore-{}".format(data.get('sns_kind'), now.strftime("%Y-%m-%d"))
# 			csv_file_loc = os.path.join(csv_dir_prefix, "{}.csv".format(csv_filename))
#
# 			if os.path.exists(csv_file_loc):
# 				DB_CNT = csv_len(csv_file_loc)
# 			else:
# 				with open(csv_file_loc, 'w') as file:
# 					file.writelines("id,img,text,has_tag,write_date,reg_date\n")
#
# 			DB_TOBE_CNT = DB_CNT + data.get('number')
#
# 			while DB_TOBE_CNT > DB_CURRENT_CNT:
#
# 				cmd_arr = [settings.GO_CRAWL_CMD, GO_CRAWL_PATH,
# 						   '-d=' + csv_file_loc,
# 						   '-t=' + data.get('crawl_type'),
# 						   '-n=' + str(500),
# 						   '-a=' + authentication,
# 						   '-s=' + setting_path,
# 						   '-e=' + data.get('env')]
#
# 				if data.get('query') != "":
# 					cmd_arr.append('-q={}'.format(data.get('query')))
# 				elif data.get('random'):
# 					cmd_arr.append('-r')
#
# 				cmd_arr.append('-l')
#
# 				print(cmd_arr)
# 				# subprocess.call(cmd_arr)
# 				# try:
# 				call(cmd_arr)
# 				# except TimeoutExpired as e:
# 				# 	continue
# 				# finally:
# 				# DB_CURRENT_CNT = collection.find({}).count()
# 				DB_CURRENT_CNT = csv_len(csv_file_loc)
#
# 			serializer.save()
# 			return Response(serializer.data, status=status.HTTP_201_CREATED)
# 		return Response(serializer.errors, status=status.HTTP_400_BAD_REQUEST)

# class CrawlMonitorView(APIView):
# 	parser_classes = (JSONParser,)
# 	authentication_classes = (SessionAuthentication, BasicAuthentication)
# 	permission_classes = (IsAuthenticated,)
#
# 	def get(self, request, format=None):
#
# 		data = request.query_params
#
# 		now = datetime.datetime.now()
# 		nowDate = now.strftime("%Y%m%d")
#
#
# 		filename = 'logs/log-' + data.get('env') + '.' + nowDate + '.log'
# 		filename = os.path.join(settings.DIR_PREFIX, filename)
#
# 		lines = []
# 		startNum = 0
#
# 		if not data.get('startNum'):
#
# 			with open(filename) as fp:
# 				for i, line in enumerate(fp):
# 					lines.append({'num':i,'text':line})
# 		else:
# 			startNum = int(data.get('startNum'))
#
# 			with open(filename) as fp:
# 				for i, line in enumerate(fp):
# 					if i > startNum:
# 						lines.append({'num': i, 'text': line})
#
# 		endNum = len(lines) + startNum
#
# 		return Response({'log':{
# 			'startNum':startNum,
# 			'endNum':endNum,
# 			'lines':lines
# 		}}, status=status.HTTP_200_OK)
#
# class CrawlCSVDataView(APIView):
# 	parser_classes = (JSONParser,)
# 	authentication_classes = (SessionAuthentication, BasicAuthentication)
# 	permission_classes = (IsAuthenticated,)
#
# 	def get(self, request, format=None):
#
# 		data = request.query_params
#
# 		now = datetime.datetime.now()
#
# 		csv_dir_prefix = '{}data'.format(settings.CRAWL_PROJ_PATH)
# 		csv_filename = "{}-explore-{}".format(data.get('sns_kind'), now.strftime("%Y-%m-%d"))
# 		csv_file_loc = os.path.join(csv_dir_prefix, "{}.csv".format(csv_filename))
#
#
# 		lines = []
# 		startNum = 0
#
# 		if not data.get('startNum'):
#
# 			with open(csv_file_loc) as fp:
# 				for i, line in enumerate(fp):
# 					lines.append({'num': i, 'text': line})
# 		else:
# 			startNum = int(data.get('startNum'))
#
# 			with open(csv_file_loc) as fp:
# 				for i, line in enumerate(fp):
# 					if i > startNum:
# 						lines.append({'num': i, 'text': line})
#
# 		endNum = len(lines) + startNum
#
# 		return Response({'csv': {
# 			'startNum': startNum,
# 			'endNum': endNum,
# 			'lines': lines
# 		}}, status=status.HTTP_200_OK)
Example #18
def build_dataset(train_text, min_count=0, sampling_rate=0):
    words = list()
    for line in train_text:
        sentence = re.sub(r"[^ㄱ-힣a-zA-Z0-9]+", ' ', line).strip().split()
        if sentence:
            words.append(sentence)

    word_counter = [['UNK', -1]]
    word_counter.extend(
        collections.Counter([word for sentence in words
                             for word in sentence]).most_common())
    word_counter = [
        item for item in word_counter
        if item[1] >= min_count or item[0] == 'UNK'
    ]

    word_list = list()
    word_dict = dict()
    for word, count in word_counter:
        word_list.append(word)  # store the words used for training (for visualization)
        word_dict[word] = len(word_dict)
    word_reverse_dict = dict(zip(word_dict.values(), word_dict.keys()))

    word_to_pos_li = dict()
    pos_list = list()
    twitter = Twitter()
    for w in word_dict:
        w_pos_li = list()
        for pos in twitter.pos(w, norm=True):
            w_pos_li.append(pos)

        word_to_pos_li[word_dict[w]] = w_pos_li
        pos_list += w_pos_li

    pos_counter = collections.Counter(pos_list).most_common()

    pos_dict = dict()
    for pos, _ in pos_counter:
        pos_dict[pos] = len(pos_dict)

    pos_reverse_dict = dict(zip(pos_dict.values(), pos_dict.keys()))

    word_to_pos_dict = dict()

    for word_id, pos_li in word_to_pos_li.items():
        pos_id_li = list()
        for pos in pos_li:
            pos_id_li.append(pos_dict[pos])
        word_to_pos_dict[word_id] = pos_id_li

    data = list()
    unk_count = 0
    for sentence in words:
        s = list()
        for word in sentence:
            if word in word_dict:
                index = word_dict[word]
            else:
                index = word_dict['UNK']
                unk_count += 1
            s.append(index)
        data.append(s)
    word_counter[0][1] = max(1, unk_count)

    # data = sub_sampling(data, word_counter, word_dict, sampling_rate)

    return data, word_dict, word_reverse_dict, pos_dict, pos_reverse_dict, word_to_pos_dict, word_list
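A minimal usage sketch of build_dataset with two illustrative sentences; it assumes the re, collections, and konlpy.tag.Twitter imports that the excerpt relies on are in scope.

sample_text = ["아버지가 방에 들어가신다", "오늘 날씨가 정말 좋다"]
(data, word_dict, word_reverse_dict, pos_dict,
 pos_reverse_dict, word_to_pos_dict, word_list) = build_dataset(sample_text, min_count=0)
print(len(word_dict), "words,", len(pos_dict), "POS tags")
print(data)  # each sentence encoded as a list of word ids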
Example #19
from collections import Counter
from konlpy.tag import Twitter
t = Twitter()
from konlpy.corpus import kolaw
from types import *
import gensim
from gensim.models import LdaModel
from gensim import corpora, models
import MySQLdb
import operator
import decimal
import math
db = MySQLdb.connect(host="localhost",
                     user="******",
                     passwd="kkms1234",
                     db="scraping",
                     charset='utf8')

cursor = db.cursor(MySQLdb.cursors.DictCursor)
cursor2 = db.cursor(MySQLdb.cursors.DictCursor)

cursor.execute("set names utf8")
cursor2.execute("set names utf8")
db.query("set character_set_connection=utf8;")
db.query("set character_set_server=utf8;")
db.query("set character_set_client=utf8;")
db.query("set character_set_results=utf8;")
db.query("set character_set_database=utf8;")

Articlenumber = []
Word = {}
Example #20
import time
from konlpy.tag import Kkma, Twitter, Komoran, Hannanum

# texts = ['관리하시고 치료하시는 거예요.', '롯데마트의 흑마늘 양념 치킨이 만약에 논란이 되고 있다.']
texts = [
    '가장 좋은', '전 상담원 통화중', '전혀', '단 한번', '더', '덜', '방송에서만', '봉제선이 없는', '실크와 같은'
]

pos_taggers = [('Komoran', Komoran()), ('kkma', Kkma()),
               ('twitter', Twitter()), ('Hannanum', Hannanum())]
results = []
for name, tagger in pos_taggers:
    tokens = []
    process_time = time.time()
    for text in texts:
        result = tagger.pos(text)
        print(name, "POS", result)
        tokens.append(result)

        result = tagger.morphs(text)
        print(name, "MORPHS", result)
        tokens.append(result)

        result = tagger.nouns(text)
        print(name, "NOUNS", result)
        tokens.append(result)

    process_time = time.time() - process_time
    print('tagger name = %10s, %.3f secs' % (name, process_time))
    results.append(tokens)
    print('----------------------------------')
Example #21
# prevent broken Hangul rendering in plots
mpl.rcParams['axes.unicode_minus'] = False
path = '/Library/Fonts/NanumGothic.ttf'
font_name = fm.FontProperties(fname=path, size=10)

# File: 1993 한국증권학회 : 심포지엄 sheet / column C presentation topics > 1993_ko.xlsx, rows = 3:430
# File: 재무관리학회_학회지 (done) : 재무관리연구 sheet / column C paper titles > jaemoo.xlsx
# File: 파생상품학회_학회지외 (1) (done) : 파생상품학회_심포지엄 sheet / column C paper titles > product.xlsx - use the 파생상품학회_선물연구 sheet instead
# File: 재무학회 현황 (재무연구, symposium, academic forum) : 재무학회-심포지엄 sheet / column C presentation topics > money.xlsx
# File: 공동학술대회 자료 : both the 심포지엄 and 발표논문 sheets / column C presentation topics on the 심포지엄 sheet + column B paper titles on the 발표논문 sheet > coacademic.xlsx
'''
The 파생상품학회_심포지엄 sheet of the 파생상품학회_학회지외 file has no paper-title column,
so the paper-title column was taken from the 파생상품학회_선물연구 sheet for this analysis.
'''
dict_data = {}
tw = Twitter()
FILE_PATH = "/Users/moonseongjae/Downloads/coacdemic.xlsx"
SAVE_PATH = "/Users/moonseongjae/python-worksapce/Project/crawling_test/result.xlsx"
try:
    wb = load_workbook(FILE_PATH)
    content_list = []
    sheet = wb['공동학술대회-발표논문']  # 공동학술대회-발표논문
    for i in range(3, 1341):  # 1341
        content = sheet["B" + str(i)].value
        if content is not None:
            content_list.append(content)
            # print(content)

    for alt in content_list:
        match_pattern = re.findall('[a-zA-Z]{3,15}', alt)
        temp = tw.pos(alt, norm=True)
Example #22
from konlpy.tag import Kkma, Twitter

# sentence = ['안녕하세요', '저는 김호근입니다', '반갑습니다']
# keyword = ['안녕', '저는', '김호근']
# print(sentence)
# print(keyword)

_kkma = Kkma()
_twt = Twitter()
text = "안녕하세요 저는 김호근 입니다."

print(text)
print(_kkma.nouns(text))
print(_twt.nouns(text))
# noun frequency check (work in progress)

Example #23
# -*- coding: utf-8 -*-

import pandas as pd
from collections import Counter
from konlpy.tag import Twitter
from konlpy.utils import pprint
from collections import Counter

data = pd.read_csv("C:/naverstore/living_data1.csv")

nlp = Twitter()

text = ''
data01 = data[data['prefer'] == '만족']

for item in data01['content']:
    try:
        text += item
    except Exception as e:
        print(e)

nouns = nlp.nouns(text)
count = Counter(nouns)
print('만족한 사람들 반응')
print(count)

wordinfo = dict()
for tags, counts in count.most_common(200):
    if (len(str(tags)) > 1):
Example #24
# -*- coding: utf-8 -*-

import numpy as np
import sys
import codecs

from konlpy.tag import Twitter
konlpy_twitter = Twitter()

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

# Read base data.
train_text = []
train_labels = []
for line in codecs.open('./base_data.tsv', 'r', 'utf-8'):
    label, text = line.strip().split('\t')
    text = ' '.join(konlpy_twitter.morphs(text))
    train_text.append(text)
    train_labels.append(label)

for line in codecs.open('./test_data.tsv', 'r', 'utf-8'):
    label, text = line.strip().split('\t')
    text = ' '.join(konlpy_twitter.morphs(text))
    train_text.append(text)
    train_labels.append(label)

# Read sample emotion data for test.
origin_text = []
test_text = []
Example #25
dff.head(100)


# <a id='the_destination5'></a>
# ## 2.2 Nouns

# <a id='the_destination6'></a>
# ### 2.2.1 Noun extraction

# In[56]:

brother_tae_change = str(list(df['text']))  # convert to a single string

import nltk
from konlpy.tag import Twitter
t = Twitter()

noun_comehere = t.nouns(brother_tae_change)  # extract nouns
noun_comehere


# <a id='the_destination7'></a>
# <b> Extract nouns longer than one character </b>

# In[57]:

noun_comehere1 = [noun_comehere for noun_comehere in noun_comehere if len(noun_comehere) > 1 ]
noun_comehere1


# <a id='the_destination8'></a>
Example #26
def tokenize_noun(doc):
    pos_tagger = Twitter()
    return pos_tagger.nouns(doc)
Example #27
# -*- coding: utf-8 -*-

import datetime

from konlpy.tag import Twitter

twitter = Twitter()


class EmotifyElement:
    def __init__(self):
        self.Happy = 0
        self.Sad = 0
        self.Surprise = 0
        self.Anger = 0
        self.HappyKeyword = {}
        self.SadKeyword = {}
        self.SurpriseKeyword = {}
        self.AngerKeyword = {}

    def modHappy(self, _happy):
        self.Happy = _happy

    def modSad(self, _sad):
        self.Sad = _sad

    def modSurprise(self, _surprise):
        self.Surprise = _surprise

    def modAnger(self, _anger):
        self.Anger = _anger
Example #28
from konlpy.tag import Twitter; t = Twitter()
from konlpy.corpus import kolaw
import nltk
import gensim
from gensim.models import LdaModel
from gensim import corpora,models
import nltk
import MySQLdb
import xlwt
db = MySQLdb.connect(host="localhost", user ="******", passwd="kkms1234", db="scraping", charset='utf8')

cursor = db.cursor(MySQLdb.cursors.DictCursor)
cursor.execute("set names utf8")

db.query("set character_set_connection=utf8;")
db.query("set character_set_server=utf8;")
db.query("set character_set_client=utf8;")
db.query("set character_set_results=utf8;")
db.query("set character_set_database=utf8;")

cursor.execute("set names utf8")
sql = "select * from Text3 where ArticleNumber<=10000"
cursor.execute(sql.encode('utf8'))

rows = cursor.fetchall()
document = ''

#koreanStopWord = kolaw.open('stopword.txt').read()

workbook = xlwt.Workbook()
Example #29
def count_wordfreq(data):
    twitter = Twitter()
    nouns = twitter.nouns(data)

    count = Counter(nouns)
    return count
Example #30
    def data_pro(self, keyword):
        print('-----------emotionDfo--------------')

        word = []
        positive_word = []
        negative_word = []
        noun_list = []
        poflag = []
        neflag = []

        po_key = []
        ne_key = []
        po_val = []
        ne_val = []

        # file = open('{}.csv'.format(keyword), 'r', encoding='utf-8-sig')
        file = open('./csv/{}.csv'.format(keyword), 'r', encoding='utf-8-sig')

        lists = file.readlines()
        file.close()

        # print(f'lists : {lists}')

        twitter = Twitter()
        morphs = []

        for sentence in lists:
            morphs.append(twitter.pos(sentence))

        pos = codecs.open('./text/positive_words_self.txt',
                          'rb',
                          encoding='utf-8-sig')

        while True:
            line = pos.readline()
            line = line.replace('\r\n', '')
            positive_word.append(line)
            # print('==========확인1==========')
            # print(f'pos line : {line}')

            if not line: break
        pos.close()

        neg = codecs.open('./text/negative_words_self.txt',
                          'rb',
                          encoding='utf-8-sig')

        while True:
            line = neg.readline()
            line = line.replace('\r\n', '')
            negative_word.append(line)
            # print('==========확인2==========')
            # print(f'neg line : {line}')

            if not line: break
        neg.close()

        # print(f'positive_word : {positive_word}')
        # print(f'negative_word : {negative_word}')

        for sentence in morphs:
            for word, text_tag in sentence:
                if text_tag in ['Noun']:
                    noun_list.append(word)
                    for x in positive_word:
                        if x == word:
                            # print(f'append poflag. word : {word}')
                            poflag.append(x)

                    for y in negative_word:
                        if y == word:
                            # print(f'append neflag. word : {word}')
                            neflag.append(y)
                '''
                #         print("부정적 :", y)
                # if text_tag in ['Noun'] and ("것" not in word) and ("내" not in word) and ("첫" not in word) and \
                #     ("나" not in word) and ("와" not in word) and ("식" not in word) and ("수" not in word) and \
                #     ("게" not in word) and ("말" not in word):
                #      noun_list.append(word)
                    
                # if text_tag in ['Noun'] and ("갑질" not in word) and ("논란" not in word) and ("폭리" not in word) and \
                #     ("허위" not in word) and ("과징금" not in word) and ("눈물" not in word) and ("피해" not in word) and \
                #     ("포화" not in word) and ("우롱" not in word) and ("위반" not in word) and ("리스크" not in word) and \
                #     ("사퇴" not in word) and ("급락" not in word) and ("하락" not in word) and ("폐업" not in word) and \
                #     ("불만" not in word) and ("산재" not in word) and ("닫아" not in word) and ("손해배상" not in word) and \
                #     ("구설수" not in word) and ("적발" not in word) and ("침해" not in word) and ("빨간불" not in word) and \
                #     ("취약" not in word) and ("불명예" not in word) and ("구형" not in word) and ("기소" not in word) and \
                #     ("반토막" not in word) and ("호소" not in word) and ("불매" not in word) and ("냉담" not in word) and \
                #     ("문제" not in word) and ("직격탄" not in word) and ("한숨" not in word) and ("불똥" not in word) and \
                #     ("항의" not in word) and ("싸늘" not in word) and ("일탈" not in word) and ("파문" not in word) and \
                #     ("횡령" not in word) and ("사과문" not in word) and ("여파" not in word) and ("울상" not in word) and \
                #     ("초토화" not in word) and ("급감" not in word) and ("우려" not in word) and ("중단" not in word) and \
                #     ("퇴출" not in word) and ("해지" not in word) and ("일베" not in word) and ("이물질" not in word) and \
                #     ("엉망" not in word) and ("소송" not in word) and ("하락" not in word) and ("매출하락" not in word) and \
                #     ("혐의" not in word) and ("부채" not in word) and ("과징금" not in word) and ("포기" not in word) and \
                #     ("약세" not in word) and ("최악" not in word) and ("손실" not in word) and ("의혹" not in word):
                #     positive_word.append(word)

                # elif text_tag in ['Noun'] and ("MOU" not in word) and ("제휴" not in word) and ("주목" not in word) and \
                #     ("호응" not in word) and ("돌파" not in word) and ("이목" not in word) and ("수상" not in word) and \
                #     ("입점" not in word) and ("인기" not in word) and ("열풍" not in word) and ("진화" not in word) and \
                #     ("대박" not in word) and ("순항" not in word) and ("유치" not in word) and ("1위" not in word) and \
                #     ("출시" not in word) and ("오픈" not in word) and ("돌풍" not in word) and ("인싸" not in word) and \
                #     ("줄서서" not in word) and ("대세" not in word) and ("트렌드" not in word) and ("불티" not in word) and \
                #     ("진출" not in word) and ("체결" not in word) and ("증가" not in word) and ("기부" not in word) and \
                #     ("신제품" not in word) and ("신상" not in word) and ("최고" not in word) and ("새로운" not in word) and \
                #     ("착한" not in word) and ("신기록" not in word) and ("전망" not in word) and ("협력" not in word) and \
                #     ("역대" not in word) and ("상승" not in word) and ("늘어" not in word) and ("승인" not in word):
                #     negative_word.append(word)
                '''
        ##
        # print(f'poflag : {poflag}')
        # print(f'neflag : {neflag}')
        ##
        count_po = Counter(poflag)
        count_ne = Counter(neflag)
        po_words = dict(count_po.most_common())
        ne_words = dict(count_ne.most_common())

        # extract only nouns for the word cloud
        '''
        ['창립', '주년', '삼성', '전자', '이건희', '회장', '도전', '혁신', '삼성', '전자', '삼성', '포럼', '개최', '김기남', '대표', 
        '핵심', '기술', '발전', '현', '코스피', '코스닥', '장', '동반', '상승', '덕성', '시스', '웍', '한국', '컴퓨터', '삼성', '전자
        ', '창립', '주년', '기념', '개최', '이재용', '부회장', '불참', '롯데', '하이마트', '온라인', '오늘', '역대', '빅', '하트', ' 
        일', '시작', '손연기', '칼럼', '차', '산업혁명', '시대', '문제', '일자리', '삼성', '전자', '모바일', '신제품', '엑시노스', ' 
        ...
        '멘토', '체험', '활동', '김기남', '삼성', '부회장', '로', '코로나', '해결', '위해', '전세계', '연구자', '협력', '순위', '주식
        ', '부자', '위', '눈앞', '이재용', '뉴', '파워', '프라', '마', '규모', '유상증자', '결정', '삼성', '전자', '창립', '주념', ' 
        기념', '회장', '도전', '혁신', '계승', '삼성', '전자', '창립', '주년', '기념', '개최']
        '''

        po_key = po_words.keys()
        po_val = po_words.values()

        ne_key = ne_words.keys()
        ne_val = ne_words.values()

        print("\n긍정적인 단어 :", po_key, po_val)
        print("부정적인 단어 :", ne_key, ne_val)

        po_df = pd.DataFrame(list(po_words.items()), columns=['tag', 'weight'])
        ne_df = pd.DataFrame(list(ne_words.items()), columns=['tag', 'weight'])
        po_df.loc[:, 'type'] = "P"
        ne_df.loc[:, 'type'] = "N"

        df = pd.concat([po_df, ne_df], axis=0)

        df.loc[:, 'keyword'] = keyword
        df.rename(columns={'Unnamed: 0': 'name'}, inplace=True)
        df.fillna(0, inplace=True)
        print(df.head())
        df.to_csv('./csv/{}_word.csv'.format(keyword), encoding='utf-8-sig')
        '''
        긍정적인 단어 : {'상승': 141, '인기': 66, '출시': 60, '전망': 36, '오픈': 30, 
                        '돌파': 19, '트렌드': 12, '체결': 12, '증가': 12, '역대': 11, '협력': 11, 
                        '주목': 11, '미소': 8, '기부': 8, '승인': 6, '최고': 6, '대세': 5, '유치': 4, 
                        '수상': 4, '불티': 2, '부상': 2, '순항': 2, '호응': 1, '진출': 1}
        부정적인 단어 : {'급감': 233, '여파': 163, '하락': 162, '피해': 115, 
                        '직격탄': 83, '논란': 61, '중단': 41, '손실': 39, '반토 막': 34, '최악': 33, 
                        '포기': 32, '폐업': 25, '급락': 25, '우려': 24, '불매': 14, '눈물': 13, 
                        '매각': 10, '호소': 9, '울상': 7, '문제': 6, '불만': 6, '약세': 5, '한숨': 5, 
                        '일베': 4, '해지': 4, '초토화': 3, '참혹': 3, '폐점': 2, '파문': 2, 
                        '과징금': 2, '항의': 1, '소송': 1, '불명예': 1, '리스크': 1, '갑질': 1, 
                        '침해': 1, '발끈': 1}
        '''
        print('---------------EmotionDfo Success----------------')
        return df