import jieba
import jieba.posseg as pseg
import nltk

def main():
    with open("./emrData/liran/2258.txt", 'r') as f:
        text = f.read()
    print(text)
    text = remove_special_chars(text)

    wordseg = list(jieba.cut(text, cut_all=False, HMM=False))
    print("默认词典: " + " ".join(wordseg))

    # -------- load a user dictionary to override the default segmentation --------
    jieba.load_userdict("./models_v3.4.0/lexicon.txt")
    # jieba.set_dictionary('filename')  # replace the main dictionary entirely
    wordseg = list(jieba.cut(text, cut_all=False, HMM=False))
    print("用户词典: " + " ".join(wordseg))

    # ------------------------- remove stopwords -------------------------
    stopwords = load_stopwords('./models_v3.4.0/stopWords.txt')  # load the stopword list
    wordseg_filtered = remove_stop_words(wordseg, stopwords)
    print("过滤停用词:", wordseg_filtered)

    # ------------------------- part-of-speech tagging -------------------------
    wordseg_with_postag = pseg.cut(text)
    print("词性标注:", "/".join(["%s %s" % (w.word, w.flag) for w in wordseg_with_postag]))

    nltk.download('averaged_perceptron_tagger')
    nltk.download('maxent_ne_chunker')
    nltk.download('words')
    tagged = nltk.pos_tag(wordseg)  # POS tagging with NLTK
    print("tagged", tagged)
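# The snippet above relies on three helpers that are not shown
# (remove_special_chars, load_stopwords, remove_stop_words). A minimal
# sketch of what they might look like, assuming the stopword file holds
# one entry per line in UTF-8 — the project's own implementations may differ:
import re

def remove_special_chars(text):
    # keep CJK characters, ASCII letters and digits; drop everything else
    return re.sub(r'[^\u4e00-\u9fa5A-Za-z0-9]', '', text)

def load_stopwords(path):
    with open(path, 'r', encoding='utf-8') as f:
        return set(line.strip() for line in f if line.strip())

def remove_stop_words(words, stopwords):
    return [w for w in words if w not in stopwords]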
def __init__(self, file_stopwords, file_word2vec_bin, file_sent2vec, file_doc2vec):
    self.stop_words = load_stopwords(file_stopwords)
    self.file_word2vec_bin = file_word2vec_bin
    self.file_sent2vec = file_sent2vec
    self.file_doc2vec = file_doc2vec
def __init__(self, N=1):
    self.N = N
    # load the stopword set
    self.stopwords = load_stopwords()
    self.gram2id = {}
    self.id2gram = {}
    self.length = 0
    self.gram2count = {}
def run_init():
    """Initialize the retrieval system."""
    print('开始初始化检索系统...')
    data = load_dataset()

    # set up the index fields
    stoplist = list(load_stopwords())
    analyzer = ChineseAnalyzer(stoplist=stoplist)
    schema = Schema(pid=ID(stored=True),
                    topic=ID(stored=True),
                    method=ID(stored=True),
                    context=TEXT(stored=True, analyzer=analyzer),
                    response=TEXT(stored=True, analyzer=analyzer))

    # create the directory that holds the index files
    if not os.path.exists(CONFIG.get('IR_DIR')):
        os.mkdir(CONFIG.get('IR_DIR'))
    idx = create_in(CONFIG.get('IR_DIR'), schema)

    # build the index
    print('开始构建索引...')
    count = {}
    writer = idx.writer()
    # all topics
    for topic in data:
        # all posts under a given topic
        for sess in tqdm(data.get(topic)):
            pid = sess.get('pid')
            all_pairs = preprocess(sess)
            # add both kinds of (context, response) dialogue pairs
            for method, pairs in all_pairs.items():
                for i, pair in enumerate(pairs):
                    writer.add_document(topic=topic,
                                        method=method,
                                        context=pair[0],
                                        response=pair[1],
                                        pid=f'{pid}-{method}-{i}')
                    count[method] = count.get(method, 0) + 1

    print('开始写入索引...')
    writer.commit()
    print('初始化成功,写入情况如下:')
    print(count)
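# Once run_init() has built the index, it can be queried through Whoosh's
# standard searcher API. A minimal sketch, assuming the same CONFIG['IR_DIR']
# location and the schema defined above; the function name and the limit
# parameter are illustrative, not taken from the project:
from whoosh.index import open_dir
from whoosh.qparser import QueryParser

def search_responses(query_text, limit=5):
    idx = open_dir(CONFIG.get('IR_DIR'))
    with idx.searcher() as searcher:
        # parse the query against the analyzed 'context' field
        parser = QueryParser('context', idx.schema)
        query = parser.parse(query_text)
        hits = searcher.search(query, limit=limit)
        # return the stored response text of each matching document
        return [hit['response'] for hit in hits]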
def __init__(self, model_file='./model/lda_model.model'):
    self.stopwords = utils.load_stopwords()
    self.num_topics = 50
    if os.path.isfile('./data/dictionary.pkl'):
        with open('./data/dictionary.pkl', 'rb') as f:
            self.dictionary = pickle.load(f)
        with open('./data/tfidf.pkl', 'rb') as f:
            self.tfidf = pickle.load(f)
    else:
        self.dictionary, self.tfidf = self.create_dictionary()
    if os.path.isfile(model_file):
        self.model = LdaModel.load(model_file)
    else:
        self.model = self.train(path_save=model_file)
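# A minimal sketch of a method this class could expose to score a new
# document, assuming self.dictionary is a gensim Dictionary and self.model
# a gensim LdaModel as loaded above; the method name and the tokens
# argument are illustrative, not taken from the project:
def infer_topics(self, tokens):
    # drop stopwords, map to a bag-of-words, then query the LDA model
    bow = self.dictionary.doc2bow([t for t in tokens if t not in self.stopwords])
    return self.model.get_document_topics(bow, minimum_probability=0.01)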
def __init__(self, collections, topic):
    self.cluster = pymongo.MongoClient(
        'mongodb://localhost:50082/',
        unicode_decode_error_handler='ignore')
    self.db = self.cluster.COVID2020

    if collections == "v2":
        self.collections_tweet = self.db.COVID2020_v2
    if collections == "v3":
        self.collections_tweet = self.db.COVID2020_v3
    if collections == "v4":
        self.collections_tweet = self.db.COVID2020_v4
    if collections == "v5":
        self.collections_tweet = self.db.COVID2020_v5

    if topic == "1":
        self.topic = [
            x.strip() for x in open("./../topic_list/1.txt", "r").readlines()
        ]
    if topic == "2":
        self.topic = [
            x.strip() for x in open("./../topic_list/2.txt", "r").readlines()
        ]
    if topic == "3":
        self.topic = [
            x.strip() for x in open("./../topic_list/3.txt", "r").readlines()
        ]
    if topic == "depression":
        self.topic = [
            x.strip()
            for x in open("./../depression_list/depression_list.txt", "r").readlines()
        ]

    self.stopwords = utils.load_stopwords()
    folder = input("Which database? depression or topic?: ")
    self.path = "./../data/database/" + str(folder) + "/"
# f.close()
print '\t' + system_name, param_id

domain = 'meeting'                   # meeting
dataset_id = 'ami'                   # ami, icsi
language = 'en'                      # en, fr
development_or_test = 'development'  # development / test

# #########################
# ### RESOURCES LOADING ###
# #########################
if domain == 'meeting':
    path_to_stopwords = path_to_root + 'resources/stopwords/meeting/stopwords.' + language + '.dat'
    stopwords = utils.load_stopwords(path_to_stopwords)

if dataset_id == 'ami':
    ids = meeting_lists.ami_development_set \
        if development_or_test == 'development' \
        else meeting_lists.ami_test_set
elif dataset_id == 'icsi':
    ids = meeting_lists.icsi_development_set \
        if development_or_test == 'development' \
        else meeting_lists.icsi_test_set

if language == 'en':
    path_to_wv = path_to_root + 'resources/GoogleNews-vectors-negative300.bin.gz'
    path_to_lm = path_to_root + 'resources/en-70k-0.2.lm'

# Load Word2Vec (takes approx. 8G RAM)
# Alternatives
# Controller_object (Controller)
# Want_suspect (Want)
# Building_subparts (Building)
# Hedging
# Be_in_agreement_on_action (agreement)
# Disgraceful_situation (situation)
# Change_event_duration (event)
# Intentional_deception (deception)

ANNOTATIONS_PATH = 'annotations.csv'
STOPWORDS_PATH = 'stop_words_FULL.txt'

if __name__ == '__main__':
    annotations = read_annotations(ANNOTATIONS_PATH)
    stopw = load_stopwords(STOPWORDS_PATH)
    res_rows = []
    giuste = 0
    for frame, word, target_synset in annotations:
        # if no mapping is available
        if target_synset is None:
            giuste += 1
            continue
        # strip the PoS tag from the word extracted from the annotations
        input_word = word.split('.')[0]
        # disambiguate input_word (the central word in the annotations)
        # using the frame as context
        synset = lesk_disambiguate(frame, input_word, stopwordset=stopw)
# -*- coding: utf-8 -*-
import json
import pandas as pd
from flask import Flask, request
from hierarchical_clustering import HierarchicalClustering
from distance_measures import lcs_distance
from utils import preprocess, load_stopwords

app = Flask(__name__)
stopwords = load_stopwords('stopwords.txt')
models = {}


@app.route('/')
def hello():
    return 'hello'


@app.route('/create/<model_name>')
def create(model_name):
    app.logger.info('model {} has been created'.format(model_name))
    model = HierarchicalClustering(model_name, lcs_distance, 0.7, True)
    models[model_name] = model
    return 'creation finished'


@app.route('/clear/<model_name>')
def clear(model_name):
    model = models[model_name]
    model.clear()
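# A quick way to exercise the endpoints above without deploying the service
# is Flask's built-in test client. A minimal sketch, assuming the module
# above is importable; the model name 'demo' is just an illustrative value:
with app.test_client() as client:
    print(client.get('/').data)             # b'hello'
    print(client.get('/create/demo').data)  # b'creation finished'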
from __future__ import division
import math
import MySQLdb
import numpy as np
from numpy.linalg import norm
from collections import Counter
from utils import to_csv, load_stopwords
from textparser import word_tokenize, tfidf

# Will search in CWD, PYTHONPATH and PATH
stopwords = load_stopwords('data/stopwords.txt')


class SearchResult(object):

    def __init__(self, page_id, page_name, vector, weight):
        self.page_id = page_id
        self.page_name = page_name
        self.vector = vector
        self.weight = weight
        self.incoming = None
        self.outgoing = None

    def __repr__(self):
        return '%s (%d): %f' % (self.page_name, self.page_id, self.weight)


class WikiIndex(object):
    """
            n_components=TopicNum,
            max_iter=1200,
            learning_method='batch',
            n_jobs=-1,
            doc_topic_prior=alpha,
            topic_word_prior=beta,
            verbose=1,
        )
        lda_feature = lda.fit_transform(X)
        with open(datapath + 'lda_model.pkl', 'wb') as f:
            pkl.dump(lda, f)
        with open(datapath + 'topic_word_distribution.pkl', 'wb') as f:
            pkl.dump(lda.components_, f)
    else:
        with open(datapath + 'lda_model.pkl', 'rb') as f:
            lda = pkl.load(f)
        lda_feature = lda.transform(X)
    with open(datapath + 'doc_topic_distribution.pkl', 'wb') as f:
        pkl.dump(lda_feature, f)


if __name__ == '__main__':
    stopwords = load_stopwords()
    build_entity_feature_with_description(datapath, stopwords=stopwords)
    build_text_feature(datapath, DATASETS, stopwords=stopwords)
    build_topic_feature_sklearn(datapath, DATASETS, stopwords=stopwords, train=True)
def main(args):
    # read configurations
    while True:
        try:
            with open('config/config.yml', 'rb') as f:
                config = yaml.load(f, Loader=yaml.FullLoader)
            break
        except Exception as e:
            logging.exception(e)

    path_cfg = config['paths']
    main_cfg = config['main']
    log_cfg = config['logging']
    pre_cfg = config['preprocessing']
    recom_cfg = config['recommendation']
    mq_cfg = config['message_queue']
    misc_cfg = config['miscellaneous']
    special_cfg = config['special_topics']

    logger = utils.get_logger_with_config(name=log_cfg['run_log_name'],
                                          logger_level=log_cfg['log_level'],
                                          handler_levels=log_cfg['handler_levels'],
                                          log_dir=log_cfg['dir'],
                                          mode=log_cfg['mode'],
                                          log_format=log_cfg['format'])

    # load stopwords
    stopwords = utils.load_stopwords(path_cfg['stopwords'])

    preprocessor = TextPreprocessor(singles=pre_cfg['singles'],
                                    puncs=pre_cfg['punctuations'],
                                    punc_frac_low=pre_cfg['min_punc_frac'],
                                    punc_frac_high=pre_cfg['max_punc_frac'],
                                    valid_count=pre_cfg['min_count'],
                                    valid_ratio=pre_cfg['min_ratio'],
                                    stopwords=stopwords)

    topics = CorpusSimilarity(name='TOPICS',
                              time_decay=recom_cfg['time_decay_base'],
                              duplicate_thresh=recom_cfg['duplicate_thresh'],
                              irrelevant_thresh=recom_cfg['irrelevant_thresh'],
                              max_recoms=recom_cfg['max_stored'],
                              logger=utils.get_logger(log_cfg['run_log_name'] + '.topics'))

    specials = CorpusTfidf(name='SPECIAL TOPICS',
                           target_corpus=topics,
                           tfidf_scheme=special_cfg['smartirs_scheme'],
                           num_keywords=special_cfg['num_keywords'],
                           time_decay=recom_cfg['time_decay_base'],
                           max_recoms=recom_cfg['max_stored_special'],
                           logger=utils.get_logger(log_cfg['run_log_name'] + '.specials'))

    # load previously saved corpus and similarity data if possible
    if args.l:
        try:
            topics.load(path_cfg['topic_save'])
        except FileNotFoundError:
            logger.exception('Topic data files not found. New files will be created')
        try:
            specials.load(path_cfg['special_save'])
        except FileNotFoundError:
            logger.exception('Special topic data files not found. New files will be created')
    # establish rabbitmq connection and declare queues
    if args.c:
        credentials = pika.PlainCredentials(username=mq_cfg['username'],
                                            password=mq_cfg['password'])
        params = pika.ConnectionParameters(host=mq_cfg['host'], credentials=credentials)
    else:
        params = pika.ConnectionParameters(host='localhost')

    lock = threading.Lock()
    save_topics = Save(topics=topics,
                       specials=specials,
                       interval=main_cfg['save_every'],
                       lock=lock,
                       topic_path=path_cfg['topic_save'],
                       specials_path=path_cfg['special_save'],
                       mod_num=misc_cfg['num_topic_files_per_folder'])
    save_topics.start()

    delete_topics = Delete(topics=topics,
                           interval=main_cfg['delete_every'],
                           keep_days=main_cfg['keep_days'],
                           lock=lock,
                           logger=utils.get_logger(log_cfg['run_log_name'] + '.topics'))
    delete_topics.start()

    while True:
        try:
            exchange = mq_cfg['exchange_name']
            connection = pika.BlockingConnection(params)
            channel = connection.channel()
            channel.basic_qos(prefetch_count=1)
            channel.exchange_declare(exchange=mq_cfg['exchange_name'], exchange_type='direct')
            channel.queue_declare(queue='new_topics')
            channel.queue_declare(queue='old_topics')
            channel.queue_declare(queue='special_topics')
            channel.queue_declare(queue='delete_topics')
            channel.queue_bind(exchange=exchange, queue='new_topics', routing_key='new')
            channel.queue_bind(exchange=exchange, queue='old_topics', routing_key='old')
            channel.queue_bind(exchange=exchange, queue='special_topics', routing_key='special')
            channel.queue_bind(exchange=exchange, queue='delete_topics', routing_key='delete')

            def decode_to_dict(msg):
                while type(msg) != dict:
                    msg = json.loads(msg)
                return msg

            def get_topic_data(topic):
                topic = decode_to_dict(topic)
                topic_id = str(topic['topicID'])
                content = preprocessor.preprocess(topic['body']) if 'body' in topic else []
                date = topic['postDate'] // misc_cfg['timestamp_factor'] if 'postDate' in topic else -1
                return topic_id, content, date

            def on_new_topic(ch, method, properties, body):
                topic_id, content, date = get_topic_data(body)
                with lock:
                    topics.add(topic_id, content, date)
                    specials.update_on_new_topic(topic_id, content, date)
                channel.basic_ack(delivery_tag=method.delivery_tag)

            def on_old_topic(ch, method, properties, body):
                topic_id, content, date = get_topic_data(body)
                logger.info('Received old topic %s', topic_id)
                channel.basic_ack(delivery_tag=method.delivery_tag)
                with lock:
                    sim_list = topics.find_most_similar(content)
                # keep at most max_stored recommendations
                sim_list = [tid for tid, val in sim_list][:recom_cfg['max_stored']]
                channel.basic_publish(exchange=exchange, routing_key='old',
                                      body=json.dumps(sim_list))

            def on_special_topic(ch, method, properties, body):
                topic_id, content, date = get_topic_data(body)
                with lock:
                    specials.add(topic_id, content, date)
                channel.basic_ack(delivery_tag=method.delivery_tag)

            def on_delete(ch, method, properties, body):
                topic_id, _, _ = get_topic_data(body)
                with lock:
                    specials.update_on_delete_topic(topic_id)
                    topics.delete(topic_id)
                channel.basic_ack(delivery_tag=method.delivery_tag)

            channel.basic_consume('new_topics', on_new_topic)
            channel.basic_consume('special_topics', on_special_topic)
            channel.basic_consume('delete_topics', on_delete)
            channel.basic_consume('old_topics', on_old_topic)
            '''
            channel.basic_consume(on_update_topic, queue='update_topics')
            '''

            logger.info(' [*] Waiting for messages. To exit press CTRL+C')
            channel.start_consuming()
        except Exception as e:
            logger.exception(e)
            logger.info('Retrying in %d seconds', main_cfg['retry_every'])
            time.sleep(main_cfg['retry_every'])
cur.close()
cur = connection.cursor(cursorclass=MySQLdb.cursors.SSCursor)

t0 = time.time()

# Settings dictionary that could one day be stored in a file
settings = {
    'commit-freq': 200,
    'prune-freq': 4000,
    'speed-freq': 100,
}

count = 0
speed = None
target = -1
stopwords = load_stopwords('data/stopwords.txt')

last_speed_update = time.time()
for page_title, page_text in extract_wiki_pages(path):
    count += 1

    # If a target is set, break when reached
    if count == target:
        break

    print(count, page_title, end=' ')

    if cont_flag:
        if page_title == last_page_title:
            cont_flag = False
config.read_file(codecs.open(path_to_root + 'config.ini', encoding='utf-8'))
nltk.data.path.append(path_to_resources + 'nltk_data/')

# #########################
# ### RESOURCES LOADING ###
# #########################
resources = {}
for language in ['fr', 'en']:
    print "loading resources..."
    start = time.time()

    URIs = config['URI_' + language]
    stopwords = utils.load_stopwords(
        path_to_resources + URIs['stopwords']
    )
    filler_words = utils.load_filler_words(
        path_to_resources + URIs['filler_words']
    )
    word_vectors = KeyedVectors.load_word2vec_format(
        path_to_resources + URIs['word_vectors'],
        binary=True
    )
    language_model = LanguageModel(
        path_to_resources + URIs['language_model']
    )