def extract_content():
    """Extract readable text from every ``corpus`` row whose status is 'ready'.

    Rows are fetched in batches of 30, ordered by id.  For each row the HTML
    in ``corpus.html`` is reduced with ``Document(...).summary(html_partial=True)``
    and the stripped text of that summary (via ``BS``) is stored in
    ``corpus.content``.  If extraction or the commit fails, ``content`` is
    set to the marker '[extract_error]' instead.  In both cases the row's
    status is then set to 'extracted', so it is not selected again and the
    loop ends once no 'ready' rows remain.
    """
    # NOTE(review): the credential/host part of this URL reads
    # '[email protected]', which looks like a scrubbed placeholder --
    # confirm the real connection string.
    DB = 'mysql+pymysql://homestead:[email protected]/public_opinion?charset=utf8'

    session = db_session(DB)
    M = db_model(DB, 'corpus')

    query = session.query(M)

    try:
        while True:
            corpuses = query.filter(M.status == 'ready').order_by(M.id).limit(30).all()
            if not corpuses:
                break

            for corpus in corpuses:
                try:
                    summary_html = Document(corpus.html).summary(html_partial=True)
                    corpus.content = BS(summary_html).text.strip()
                    session.commit()
                except Exception as exc:
                    # `Exception` (not a bare except) lets Ctrl-C / SystemExit
                    # stop the job.  Roll back first: if the failure came from
                    # commit(), the session refuses further work until the
                    # broken transaction is discarded.
                    session.rollback()
                    corpus.content = '[extract_error]'
                    session.commit()
                    print('===> extract_content error, id: ', corpus.id, exc)

                # Mark the row as processed whether or not extraction worked,
                # so the batch query above makes progress.
                corpus.status = 'extracted'
                session.commit()
    finally:
        session.close()
def export_csv():
    """Write every 'marked' ``corpus`` row to the CSV file at ``csvfile_path``.

    The file starts with the header row
    ``url, website, published_at, word_freq, topic`` followed by one line per
    row, ordered by id.  Rows are read in pages of 3000 using offset/limit
    until a page comes back empty.
    """
    # NOTE(review): the credential/host part of this URL reads
    # '[email protected]', which looks like a scrubbed placeholder --
    # confirm the real connection string.
    DB = 'mysql+pymysql://homestead:[email protected]/public_opinion?charset=utf8'

    session = db_session(DB)
    M = db_model(DB, 'corpus')

    query = session.query(M).filter(M.status == 'marked').order_by(M.id)

    table_head = ['url', 'website', 'published_at', 'word_freq', 'topic']

    offset = 0
    limit = 3000

    try:
        # On Python 3 the csv module needs a text-mode file opened with
        # newline=''; the previous binary mode ('wb') makes writerow() raise
        # TypeError.  The `with` block also closes the file if a query or
        # write fails part-way.
        # NOTE(review): UTF-8 is chosen to match the connection's
        # charset=utf8 -- confirm it suits the consumers of this file.
        with open(csvfile_path, 'w', newline='', encoding='utf-8') as csv_file:
            writer = csv.writer(csv_file)
            writer.writerow(table_head)

            while True:
                corpuses = query.offset(offset).limit(limit).all()
                if not corpuses:
                    break

                for corpus in corpuses:
                    writer.writerow([
                        corpus.url,
                        corpus.website,
                        corpus.published_at,
                        corpus.word_freq,
                        corpus.topic,
                    ])
                    print('===> write id: ', corpus.id)

                offset += limit
    finally:
        session.close()
def mark_topic():
    """Tag every 'segmented' ``corpus`` row with a topic bitmask.

    For each row, ``corpus.word_freq`` is parsed as JSON and passed to
    ``check_topic1`` .. ``check_topic10``; the matching ``TOPIC1`` ..
    ``TOPIC10`` flags are OR-ed together and stored in ``corpus.topic``.
    If parsing, checking or the commit fails, the error is printed and the
    topic is left as it was.  In both cases the row's status is then set to
    'marked', so it is not selected again.

    Rows are processed in batches of 30, ordered by id, until no
    'segmented' rows remain.
    """
    # NOTE(review): the credential/host part of this URL reads
    # '[email protected]', which looks like a scrubbed placeholder --
    # confirm the real connection string.
    DB = 'mysql+pymysql://homestead:[email protected]/public_opinion?charset=utf8'

    session = db_session(DB)
    M = db_model(DB, 'corpus')

    query = session.query(M)

    # (checker, flag) pairs, applied in this order to each row.
    topic_checks = (
        (check_topic1, TOPIC1),
        (check_topic2, TOPIC2),
        (check_topic3, TOPIC3),
        (check_topic4, TOPIC4),
        (check_topic5, TOPIC5),
        (check_topic6, TOPIC6),
        (check_topic7, TOPIC7),
        (check_topic8, TOPIC8),
        (check_topic9, TOPIC9),
        (check_topic10, TOPIC10),
    )

    try:
        while True:
            # .limit(...).all() returns a real list.  Testing the bare Query
            # object (as before) is always truthy, so the loop never ended.
            corpuses = query.filter(M.status == 'segmented').order_by(M.id).limit(30).all()
            if not corpuses:
                break

            for corpus in corpuses:
                try:
                    word_freq = json.loads(corpus.word_freq)

                    topic = 0
                    for check, flag in topic_checks:
                        if check(word_freq):
                            topic |= flag

                    corpus.topic = topic
                    session.commit()

                    print('===> mark topic, id: ', corpus.id)

                except Exception as exc:
                    # `Exception` (not a bare except) lets Ctrl-C / SystemExit
                    # stop the job.  Roll back so a failed commit does not
                    # leave the session unusable for the status update below.
                    session.rollback()
                    print('===> mark topic error, id: ', corpus.id, exc)

                # Mark the row as processed even on error so the batch query
                # above makes progress.
                corpus.status = 'marked'
                session.commit()
    finally:
        session.close()
def mark_topic():
    """Tag every 'segmented' ``corpus`` row with a topic bitmask.

    For each row, ``corpus.word_freq`` is parsed as JSON and passed to
    ``check_topic1`` .. ``check_topic10``; the matching ``TOPIC1`` ..
    ``TOPIC10`` flags are OR-ed together and stored in ``corpus.topic``.
    If parsing, checking or the commit fails, the error is printed and the
    topic is left as it was.  In both cases the row's status is then set to
    'marked', so it is not selected again.

    Rows are processed in batches of 30, ordered by id, until no
    'segmented' rows remain.
    """
    # NOTE(review): `mark_topic` is defined twice in this file; this later
    # definition is the one that takes effect.  Consider removing one copy.
    # NOTE(review): the credential/host part of this URL reads
    # '[email protected]', which looks like a scrubbed placeholder --
    # confirm the real connection string.
    DB = 'mysql+pymysql://homestead:[email protected]/public_opinion?charset=utf8'

    session = db_session(DB)
    M = db_model(DB, 'corpus')

    query = session.query(M)

    # (checker, flag) pairs, applied in this order to each row.
    topic_checks = (
        (check_topic1, TOPIC1),
        (check_topic2, TOPIC2),
        (check_topic3, TOPIC3),
        (check_topic4, TOPIC4),
        (check_topic5, TOPIC5),
        (check_topic6, TOPIC6),
        (check_topic7, TOPIC7),
        (check_topic8, TOPIC8),
        (check_topic9, TOPIC9),
        (check_topic10, TOPIC10),
    )

    try:
        while True:
            # .limit(...).all() returns a real list.  Testing the bare Query
            # object (as before) is always truthy, so the loop never ended.
            corpuses = query.filter(M.status == 'segmented').order_by(M.id).limit(30).all()
            if not corpuses:
                break

            for corpus in corpuses:
                try:
                    word_freq = json.loads(corpus.word_freq)

                    topic = 0
                    for check, flag in topic_checks:
                        if check(word_freq):
                            topic |= flag

                    corpus.topic = topic
                    session.commit()

                    print('===> mark topic, id: ', corpus.id)

                except Exception as exc:
                    # `Exception` (not a bare except) lets Ctrl-C / SystemExit
                    # stop the job.  Roll back so a failed commit does not
                    # leave the session unusable for the status update below.
                    session.rollback()
                    print('===> mark topic error, id: ', corpus.id, exc)

                # Mark the row as processed even on error so the batch query
                # above makes progress.
                corpus.status = 'marked'
                session.commit()
    finally:
        session.close()
def segment():
    """Compute ``word_freq`` for every extracted ``corpus`` row.

    Selects rows whose status is 'extracted' and whose content is not the
    '[extract_error]' marker, ordered by id.  For each row the result of
    ``segment_text(corpus.content)`` is stored in ``corpus.word_freq``.  If
    that call or the commit fails, the error is printed and ``word_freq`` is
    left as it was.  In both cases the row's status is then set to
    'segmented'.
    """
    # NOTE(review): the credential/host part of this URL reads
    # '[email protected]', which looks like a scrubbed placeholder --
    # confirm the real connection string.
    DB = 'mysql+pymysql://homestead:[email protected]/public_opinion?charset=utf8'

    session = db_session(DB)
    M = db_model(DB, 'corpus')

    query = session.query(M)

    try:
        # Load the rows up front: the loop commits (and may roll back) on the
        # same session, so it should not depend on a still-open result set.
        corpuses = query.filter(
            M.status == 'extracted',
            M.content != '[extract_error]',
        ).order_by(M.id).all()

        for corpus in corpuses:
            try:
                corpus.word_freq = segment_text(corpus.content)
                session.commit()
                print('===> segment id: ', corpus.id)
            except Exception as exc:
                # `Exception` (not a bare except) lets Ctrl-C / SystemExit
                # stop the job.  Roll back so a failed commit does not leave
                # the session unusable for the status update below.
                session.rollback()
                print('===> segment error, id: ', corpus.id, exc)

            corpus.status = 'segmented'
            session.commit()
    finally:
        session.close()
# Example no. 6
# 0
 def open_spider(self, spider):
     """Create the DB session, the ``corpus`` model and a base query on ``self``.

     Stores ``db_session(DB)`` as ``self.session``, ``db_model(DB, 'corpus')``
     as ``self.model`` and ``self.session.query(self.model)`` as
     ``self.query``.  ``spider`` is not used in the lines shown.

     NOTE(review): the name and signature match Scrapy's ``open_spider``
     hook, so this is presumably part of an item pipeline -- confirm against
     the enclosing class.
     """
     # The local name `session` is bound too, but nothing in the lines shown
     # reads it.
     self.session = session = db_session(DB)
     self.model = db_model(DB, 'corpus')
     self.query = self.session.query(self.model)