Example #1
def main():
    with open("./emrData/liran/2258.txt", 'r') as f:
        text = f.read()
    print(text)
    text = remove_special_chars(text)
    wordseg = list(jieba.cut(text, cut_all=False, HMM=False))
    print("默认词典: " + " ".join(wordseg))

    # -------------- Use a custom user dictionary to adjust the default segmentation -------------------------
    jieba.load_userdict("./models_v3.4.0/lexicon.txt")
    # jieba.set_dictionary('filename')  # replace the main dictionary entirely
    wordseg = list(jieba.cut(text, cut_all=False, HMM=False))
    print("用户词典: " + " ".join(wordseg))

    # ------------------------- Remove stopwords -------------------------
    stopwords = load_stopwords('./models_v3.4.0/stopWords.txt')  # load the stopword list
    wordseg_filtered = remove_stop_words(wordseg, stopwords)
    print("过滤停用词:", wordseg_filtered)

    # --------------- Part-of-speech tagging -------------------------
    wordseg_with_postag = pseg.cut(text)
    print("词性标注:",
          "/".join(["%s %s" % (w.word, w.flag) for w in wordseg_with_postag]))

    nltk.download('averaged_perceptron_tagger')
    nltk.download('maxent_ne_chunker')
    nltk.download('words')
    tagged = nltk.pos_tag(wordseg)  # POS tagging with NLTK
    print("tagged", tagged)
Example #2
    def __init__(self, file_stopwords, file_word2vec_bin, file_sent2vec,
                 file_doc2vec):
        self.stop_words = load_stopwords(file_stopwords)
        self.file_word2vec_bin = file_word2vec_bin
        self.file_sent2vec = file_sent2vec
        self.file_doc2vec = file_doc2vec
Example #3
    def __init__(self, N=1):
        self.N = N
        # load the stopword set
        self.stopwords = load_stopwords()

        self.gram2id = {}
        self.id2gram = {}
        self.length = 0
        self.gram2count = {}
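Only the constructor is shown; a hypothetical `add_sentence` method sketches how the tables above might be populated while skipping n-grams that contain stopwords (method name and logic are assumptions):

    def add_sentence(self, tokens):
        # tokens: a pre-segmented sentence; self.N is the n-gram order
        for i in range(len(tokens) - self.N + 1):
            gram = tuple(tokens[i:i + self.N])
            if any(t in self.stopwords for t in gram):
                continue  # drop n-grams that contain a stopword
            if gram not in self.gram2id:
                self.gram2id[gram] = self.length
                self.id2gram[self.length] = gram
                self.length += 1
            self.gram2count[gram] = self.gram2count.get(gram, 0) + 1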
Example #4
def run_init():
    """初始化检索系统
    """
    print('Initializing the retrieval system...')
    data = load_dataset()

    # define the index schema fields
    stoplist = list(load_stopwords())
    analyzer = ChineseAnalyzer(stoplist=stoplist)
    schema = Schema(pid=ID(stored=True),
                    topic=ID(stored=True),
                    method=ID(stored=True),
                    context=TEXT(stored=True, analyzer=analyzer),
                    response=TEXT(stored=True, analyzer=analyzer))

    # create the directory that stores the index files
    if not os.path.exists(CONFIG.get('IR_DIR')):
        os.mkdir(CONFIG.get('IR_DIR'))
    idx = create_in(CONFIG.get('IR_DIR'), schema)

    # build the index
    print('Building the index...')
    count = {}
    writer = idx.writer()
    # iterate over all topics
    for topic in data:
        # all posts under the current topic
        for sess in tqdm(data.get(topic)):
            pid = sess.get('pid')
            all_pairs = preprocess(sess)
            # add both kinds of (context, response) dialogue pairs
            for method, pairs in all_pairs.items():
                for i, pair in enumerate(pairs):
                    writer.add_document(topic=topic,
                                        method=method,
                                        context=pair[0],
                                        response=pair[1],
                                        pid=f'{pid}-{method}-{i}')
                    count[method] = count.get(method, 0) + 1

    print('Writing the index...')
    writer.commit()
    print('Initialization succeeded; write counts per method:')
    print(count)
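A minimal sketch of how the index built by `run_init` might be queried afterwards, assuming the same `CONFIG` dict and Whoosh schema as above (the helper name `run_query` is hypothetical):

from whoosh.index import open_dir
from whoosh.qparser import QueryParser

def run_query(text, topn=5):
    idx = open_dir(CONFIG.get('IR_DIR'))
    parser = QueryParser('context', schema=idx.schema)
    with idx.searcher() as searcher:
        hits = searcher.search(parser.parse(text), limit=topn)
        # stored fields can be read directly from each hit
        return [(hit['pid'], hit['response']) for hit in hits]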
Example #5
    def __init__(self, model_file='./model/lda_model.model'):
        self.stopwords = utils.load_stopwords()
        self.num_topics = 50

        if os.path.isfile('./data/dictionary.pkl'):
            with open('./data/dictionary.pkl', 'rb') as f:
                self.dictionary = pickle.load(f)

            with open('./data/tfidf.pkl', 'rb') as f:
                self.tfidf = pickle.load(f)

        else:
            self.dictionary, self.tfidf = self.create_dictionary()

        if os.path.isfile(model_file):
            self.model = LdaModel.load(model_file)

        else:
            self.model = self.train(path_save=model_file)
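The class only shows its initialization; a hypothetical `get_topics` method sketches how a new, already tokenized document could be scored with the loaded gensim objects (method name and signature are assumptions):

    def get_topics(self, tokens, topn=5):
        # tokens: a tokenized document with stopwords already removed
        bow = self.dictionary.doc2bow(tokens)
        weighted = self.tfidf[bow]
        topics = self.model.get_document_topics(weighted, minimum_probability=0.0)
        return sorted(topics, key=lambda t: t[1], reverse=True)[:topn]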
Example #6
    def __init__(self, collections, topic):

        self.cluster = pymongo.MongoClient(
            'mongodb://localhost:50082/',
            unicode_decode_error_handler='ignore')
        self.db = self.cluster.COVID2020

        if collections == "v2":
            self.collections_tweet = self.db.COVID2020_v2
        if collections == "v3":
            self.collections_tweet = self.db.COVID2020_v3
        if collections == "v4":
            self.collections_tweet = self.db.COVID2020_v4
        if collections == "v5":
            self.collections_tweet = self.db.COVID2020_v5
        if topic == "1":
            self.topic = [
                x.strip()
                for x in open("./../topic_list/1.txt", "r").readlines()
            ]
        if topic == "2":
            self.topic = [
                x.strip()
                for x in open("./../topic_list/2.txt", "r").readlines()
            ]
        if topic == "3":
            self.topic = [
                x.strip()
                for x in open("./../topic_list/3.txt", "r").readlines()
            ]
        if topic == "depression":
            self.topic = [
                x.strip()
                for x in open("./../depression_list/depression_list.txt",
                              "r").readlines()
            ]

        self.stopwords = utils.load_stopwords()
        folder = input("Which database? depression or topic?: ")
        self.path = "./../data/database/" + str(folder) + "/"
Example #7
        #     f.close()

    print('\t' + system_name, param_id)


domain = 'meeting'  # meeting
dataset_id = 'ami'  # ami, icsi
language = 'en'  # en, fr
development_or_test = 'development'  # development / test

# #########################
# ### RESOURCES LOADING ###
# #########################
if domain == 'meeting':
    path_to_stopwords = path_to_root + 'resources/stopwords/meeting/stopwords.' + language + '.dat'
    stopwords = utils.load_stopwords(path_to_stopwords)

    if dataset_id == 'ami':
        ids = meeting_lists.ami_development_set \
            if development_or_test == 'development' \
            else meeting_lists.ami_test_set
    elif dataset_id == 'icsi':
        ids = meeting_lists.icsi_development_set \
            if development_or_test == 'development' \
            else meeting_lists.icsi_test_set

if language == 'en':
    path_to_wv = path_to_root + 'resources/GoogleNews-vectors-negative300.bin.gz'
    path_to_lm = path_to_root + 'resources/en-70k-0.2.lm'

# Load Word2Vec (takes approx. 8G RAM)
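The snippet stops just before the load itself; a minimal continuation, assuming gensim's `KeyedVectors` loader (the same call used in Example #14):

from gensim.models import KeyedVectors
word_vectors = KeyedVectors.load_word2vec_format(path_to_wv, binary=True)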
Example #8
# Alternatives
# Controller_object (Controller)
# Want_suspect (Want)
# Building_subparts (Building)
# Hedging
# Be_in_agreement_on_action (agreement)
# Disgraceful_situation (situation)
# Change_event_duration (event)
# Intentional_deception (deception)

ANNOTATIONS_PATH = 'annotations.csv'
STOPWORDS_PATH = 'stop_words_FULL.txt'

if __name__ == '__main__':
    annotations = read_annotations(ANNOTATIONS_PATH)
    stopw = load_stopwords(STOPWORDS_PATH)

    res_rows = []

    giuste = 0
    for frame, word, target_synset in annotations:
        # Skip if no mapping is available
        if target_synset is None:
            giuste += 1
            continue

        # Strip the PoS tag from the word extracted from the annotations
        input_word = word.split('.')[0]
        # disambiguate input_word (the central word in the annotations)
        # using the frame as context
        synset = lesk_disambiguate(frame, input_word, stopwordset=stopw)
Example #9
# -*- coding: utf-8 -*-

import json
import pandas as pd
from flask import Flask, request
from hierarchical_clustering import HierarchicalClustering
from distance_measures import lcs_distance
from utils import preprocess, load_stopwords

app = Flask(__name__)
stopwords = load_stopwords('stopwords.txt')
models = {}


@app.route('/')
def hello():
    return 'hello'


@app.route('/create/<model_name>')
def create(model_name):
    app.logger.info('model {} has been created'.format(model_name))
    model = HierarchicalClustering(model_name, lcs_distance, 0.7, True)
    models[model_name] = model
    return 'creation finished'


@app.route('/clear/<model_name>')
def clear(model_name):
    model = models[model_name]
    model.clear()
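A small client-side sketch of how the two routes above might be exercised, assuming the app runs locally on Flask's default port 5000:

import requests

requests.get('http://localhost:5000/create/demo')  # register a new clustering model
requests.get('http://localhost:5000/clear/demo')   # reset it again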
Example #10
from __future__ import division

import math
import MySQLdb
import numpy as np

from numpy.linalg import norm
from collections import Counter

from utils import to_csv, load_stopwords
from textparser import word_tokenize, tfidf

# Will search in CWD, PYTHONPATH and PATH
stopwords = load_stopwords('data/stopwords.txt')


class SearchResult(object):
    def __init__(self, page_id, page_name, vector, weight):
        self.page_id = page_id
        self.page_name = page_name
        self.vector = vector
        self.weight = weight
        self.incoming = None
        self.outgoing = None

    def __repr__(self):
        return '%s (%d): %f' % (self.page_name, self.page_id, self.weight)


class WikiIndex(object):
    """
Example #11
            n_components=TopicNum,
            max_iter=1200,
            learning_method='batch',
            n_jobs=-1,
            doc_topic_prior=alpha,
            topic_word_prior=beta,
            verbose=1,
        )
        lda_feature = lda.fit_transform(X)
        with open(datapath + 'lda_model.pkl', 'wb') as f:
            pkl.dump(lda, f)
        with open(datapath + 'topic_word_distribution.pkl', 'wb') as f:
            pkl.dump(lda.components_, f)
    else:
        with open(datapath + 'lda_model.pkl', 'rb') as f:
            lda = pkl.load(f)
        lda_feature = lda.transform(X)

    with open(datapath + 'doc_topic_distribution.pkl', 'wb') as f:
        pkl.dump(lda_feature, f)


if __name__ == '__main__':
    stopwords = load_stopwords()

    build_entity_feature_with_description(datapath, stopwords=stopwords)
    build_text_feature(datapath, DATASETS, stopwords=stopwords)
    build_topic_feature_sklearn(datapath,
                                DATASETS,
                                stopwords=stopwords,
                                train=True)
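A hypothetical helper for inspecting the persisted topic-word distribution; it assumes `vectorizer` is the fitted scikit-learn vectorizer that produced `X` above (not shown in the snippet):

import pickle as pkl
import numpy as np

def print_top_words(datapath, vectorizer, n_top=10):
    with open(datapath + 'topic_word_distribution.pkl', 'rb') as f:
        topic_word = pkl.load(f)
    vocab = np.array(vectorizer.get_feature_names_out())
    for k, weights in enumerate(topic_word):
        top = vocab[np.argsort(weights)[::-1][:n_top]]
        print('Topic %d: %s' % (k, ' '.join(top)))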
Example #12
def main(args):  
    # read configurations
    while True:
        try:
            with open('config/config.yml', 'rb') as f:
                config = yaml.load(f, Loader=yaml.FullLoader)
                break
        except Exception as e:
            logging.exception(e)

    path_cfg = config['paths']
    main_cfg = config['main']
    log_cfg = config['logging']
    pre_cfg = config['preprocessing']
    recom_cfg = config['recommendation']
    mq_cfg = config['message_queue']
    misc_cfg = config['miscellaneous']
    special_cfg = config['special_topics']
    logger = utils.get_logger_with_config(name=log_cfg['run_log_name'],
                                          logger_level=log_cfg['log_level'],
                                          handler_levels=log_cfg['handler_levels'],
                                          log_dir=log_cfg['dir'],
                                          mode=log_cfg['mode'],
                                          log_format=log_cfg['format'])


    # load stopwords
    stopwords = utils.load_stopwords(path_cfg['stopwords'])

    preprocessor = TextPreprocessor(singles=pre_cfg['singles'],
                                    puncs=pre_cfg['punctuations'],
                                    punc_frac_low=pre_cfg['min_punc_frac'],
                                    punc_frac_high=pre_cfg['max_punc_frac'],
                                    valid_count=pre_cfg['min_count'],
                                    valid_ratio=pre_cfg['min_ratio'],
                                    stopwords=stopwords)

    topics = CorpusSimilarity(name='TOPICS',
                              time_decay=recom_cfg['time_decay_base'],
                              duplicate_thresh=recom_cfg['duplicate_thresh'],
                              irrelevant_thresh=recom_cfg['irrelevant_thresh'],
                              max_recoms=recom_cfg['max_stored'],
                              logger=utils.get_logger(log_cfg['run_log_name']+'.topics')
                              )

    specials = CorpusTfidf(name='SPECIAL TOPICS',
                           target_corpus=topics,
                           tfidf_scheme=special_cfg['smartirs_scheme'],
                           num_keywords=special_cfg['num_keywords'],
                           time_decay=recom_cfg['time_decay_base'],
                           max_recoms=recom_cfg['max_stored_special'],
                           logger=utils.get_logger(log_cfg['run_log_name']+'.specials')
                           )

    # load previously saved corpus and similarity data if possible
    if args.l:
        try:
            topics.load(path_cfg['topic_save'])
        except FileNotFoundError:
            logger.exception('Topic data files not found. New files will be created')
        try:
            specials.load(path_cfg['special_save'])
        except FileNotFoundError:
            logger.exception('Special topic data files not found. New files will be created')

    # establish rabbitmq connection and declare queues
    if args.c:
        credentials = pika.PlainCredentials(username=mq_cfg['username'],
                                            password=mq_cfg['password'])
        params = pika.ConnectionParameters(host=mq_cfg['host'],
                                           credentials=credentials)
    else:
        params = pika.ConnectionParameters(host='localhost')

    lock = threading.Lock()
    save_topics = Save(topics=topics,
                       specials=specials,
                       interval=main_cfg['save_every'],
                       lock=lock,
                       topic_path=path_cfg['topic_save'],
                       specials_path=path_cfg['special_save'],
                       mod_num=misc_cfg['num_topic_files_per_folder'])
    
    save_topics.start()

    delete_topics = Delete(topics=topics,
                           interval=main_cfg['delete_every'],
                           keep_days=main_cfg['keep_days'],
                           lock=lock,
                           logger=utils.get_logger(log_cfg['run_log_name']+'.topics'))

    delete_topics.start()
    
    while True:       
        try:
            exchange = mq_cfg['exchange_name']
            connection = pika.BlockingConnection(params)
            channel = connection.channel()
            channel.basic_qos(prefetch_count=1)
            channel.exchange_declare(exchange=mq_cfg['exchange_name'], 
                                     exchange_type='direct')
          
            channel.queue_declare(queue='new_topics')
            channel.queue_declare(queue='old_topics')
            channel.queue_declare(queue='special_topics')
            channel.queue_declare(queue='delete_topics')

            channel.queue_bind(exchange=exchange, 
                               queue='new_topics', routing_key='new')
            channel.queue_bind(exchange=exchange,
                               queue='old_topics', routing_key='old')
            channel.queue_bind(exchange=exchange,
                               queue='special_topics', routing_key='special')
            channel.queue_bind(exchange=exchange, 
                               queue='delete_topics', routing_key='delete')
            
            def decode_to_dict(msg):
                while type(msg) != dict:
                    msg = json.loads(msg)
                return msg

            def get_topic_data(topic):
                topic = decode_to_dict(topic)
                topic_id = str(topic['topicID'])
                content = preprocessor.preprocess(topic['body']) if 'body' in topic else []
                date = topic['postDate']//misc_cfg['timestamp_factor'] if 'postDate' in topic else -1

                return topic_id, content, date

            def on_new_topic(ch, method, properties, body):
                topic_id, content, date = get_topic_data(body)

                with lock:
                    topics.add(topic_id, content, date)
                    specials.update_on_new_topic(topic_id, content, date)

                channel.basic_ack(delivery_tag=method.delivery_tag)      

            def on_old_topic(ch, method, properties, body):
                topic_id, content, date = get_topic_data(body)
                logger.info('Received old topic %s', topic_id)
                channel.basic_ack(delivery_tag=method.delivery_tag)

                with lock:
                    sim_list = topics.find_most_similar(content)

                sim_list = [tid for tid, val in sim_list][:recom_cfg['max_stored']]  # slice, not index: keep at most max_stored ids
                
                channel.basic_publish(exchange=exchange,
                                      routing_key='old',
                                      body=json.dumps(sim_list))

            def on_special_topic(ch, method, properties, body):
                topic_id, content, date = get_topic_data(body)

                with lock:
                    specials.add(topic_id, content, date)
                
                channel.basic_ack(delivery_tag=method.delivery_tag) 

            def on_delete(ch, method, properties, body):
                topic_id, _, _ = get_topic_data(body)
                
                with lock:
                    specials.update_on_delete_topic(topic_id)
                    topics.delete(topic_id)

                channel.basic_ack(delivery_tag=method.delivery_tag)

            channel.basic_consume('new_topics', on_new_topic)
            channel.basic_consume('special_topics', on_special_topic)
            channel.basic_consume('delete_topics', on_delete)
            channel.basic_consume('old_topics', on_old_topic)
            '''
            channel.basic_consume(on_update_topic, queue='update_topics')                                  
            '''    
            logger.info(' [*] Waiting for messages. To exit press CTRL+C')
            channel.start_consuming()
        
        except Exception as e:
            logger.exception(e)
            logger.info('Retrying in %d seconds', main_cfg['retry_every'])
            time.sleep(main_cfg['retry_every'])
Example #13
    cur.close()
    cur = connection.cursor(cursorclass=MySQLdb.cursors.SSCursor)
    t0 = time.time()

    # Settings dictionary that can be one day stored in a file
    settings = {
        'commit-freq': 200,
        'prune-freq': 4000,
        'speed-freq': 100,
    }

    count = 0
    speed = None
    target = -1

    stopwords = load_stopwords('data/stopwords.txt')

    last_speed_update = time.time()

    for page_title, page_text in extract_wiki_pages(path):
        count += 1

        # If a target is set, break when reached
        if count == target:
            break

        print(count, page_title, end=' ')

        if cont_flag:
            if page_title == last_page_title:
                cont_flag = False
Example #14
config.read_file(codecs.open(path_to_root + 'config.ini', encoding='utf-8'))

nltk.data.path.append(path_to_resources + 'nltk_data/')

# #########################
# ### RESOURCES LOADING ###
# #########################
resources = {}
for language in ['fr', 'en']:

    print "loading resources..."
    start = time.time()

    URIs = config['URI_' + language]
    stopwords = utils.load_stopwords(
        path_to_resources + URIs['stopwords']
    )

    filler_words = utils.load_filler_words(
        path_to_resources + URIs['filler_words']
    )

    word_vectors = KeyedVectors.load_word2vec_format(
        path_to_resources + URIs['word_vectors'],
        binary=True
    )

    language_model = LanguageModel(
        path_to_resources + URIs['language_model']
    )
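A plausible continuation of the loop body, caching the loaded objects in the `resources` dict declared above (the exact keys are assumptions):

    resources[language] = {
        'stopwords': stopwords,
        'filler_words': filler_words,
        'word_vectors': word_vectors,
        'language_model': language_model,
    }
    print('resources for %s loaded in %.1fs' % (language, time.time() - start))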