def with_synonyme_meal():
    for i in range(0, len(label_meal_db)):
        label_list = label_meal_db[i]
        label_id = label_list['id']
        label = label_list['name']
        label_translate_synonymes = translate_synonymes(label)
        label_dic.append({'id': 'doc_%i' % label_id,
                          'tokens': cut(label_translate_synonymes),
                          'payload': label})
        logger.info(i)
        logger.info('label_id= %s' % label_id)
    '''
    for j in range(0, len(mysql_db)):
        mysql_data_list = mysql_db[j]
        article_id = mysql_data_list[0]     # id
        article_label = mysql_data_list[1]  # label
        article_title = mysql_data_list[2]  # title
        article_text = mysql_data_list[4]   # text
        if article_title is None:
            article_title = ''
        if article_text is None:
            article_text = ''
        article_title_text = article_title + article_text
        article_title_text_translate_synonymes = translate_synonymes(article_title_text)
        article_title_text_dic.append({'id': 'doc_%i' % article_id,
                                       'tokens': cut(article_title_text_translate_synonymes),
                                       'payload': article_title_text})
    '''
    server_path = os.path.join(os.path.abspath(os.path.dirname(__file__)),
                               'servers/create_test_withsyn_meal1')  # model path
    server = SessionServer(server_path)
    server.drop_index()  # drop any existing index
    utils.upload_chunked(server, label_dic, chunksize=1000)  # upload to simserver in chunks
    server.train(label_dic, method='lsi')  # train on the preprocessed documents
    server.index(label_dic)  # build the index
    return None
def GensimClient(texts):
    similarities = None
    gsDir = os.getcwd()
    gss = gsDir + os.sep + u"gensim_server" + os.sep
    server = SessionServer(gss)
    logger.debug(u"%s" % server.status())
    try:
        corpus = [{u"id": u"doc_%i" % num,
                   u"tokens": utils.simple_preprocess(text)}
                  for num, text in enumerate(texts)]
        # send 1k docs at a time
        utils.upload_chunked(server, corpus, chunksize=1000)
        server.train(corpus, method=u"lsi")
        # index the same documents that we trained on...
        server.index(corpus)
        similarities = findSimilar(texts, server, corpus)
    except Exception as msg:
        logger.debug(u"%s" % msg)
    return similarities  # assumed return: the snippet assigns this but never uses it otherwise
def get_service():
    SERVER_DIR = '/tmp/simserver/'
    try:
        os.mkdir(SERVER_DIR)
    except OSError:
        # directory already exists
        pass
    service = SessionServer(SERVER_DIR)
    service.set_autosession()
    return service
def service_initialization(directory_path='.', readme_path='.', autosession=True):
    # directory_path: directory in which to place this service,
    #   e.g. '../Extract_features_using_readmeAPIsource/'
    # readme_path: directory where the readme file source is stored,
    #   e.g. './Readme/Readme_set_complete'
    service = SessionServer(directory_path, autosession)
    if 'model' not in os.listdir(directory_path + '/a/'):
        upload_train(service, readme_path)
    return service
def with_synoymes_meal():
    km_server = SessionServer(
        os.path.join(servers_path, 'create_test_withsyn_meal1'))  # the index
    article_db = db.query('select * from article_all1')
    min_similarity = 0.1  # 0.2
    max_results = 5  # 2
    #db.execute('update article_all1 set meal=null')  # initial
    for i in range(0, len(article_db)):
        article_list = article_db[i]
        article_id = article_list['id']
        title = article_list['title']
        introduce = article_list['introduce']
        content = article_list['content']
        js_content = json.loads(content)
        content_all = ''
        for at in range(0, len(js_content)):
            js_content_list = js_content[at]
            js_content_content = js_content_list['content']
            js_content_title = js_content_list['title']
            soup_js_content_title = BeautifulSoup(js_content_title)
            soup_js_content_content = BeautifulSoup(js_content_content)
            soup_title = soup_js_content_title.get_text()
            soup_content = soup_js_content_content.get_text()
            content_all = content_all + soup_title + '.' + soup_content
        content_all = content_all.replace("\n", "")
        article = title + '.' + introduce + '.' + content_all
        article_synonymes = translate_synonymes(article)  # convert the article text to its synonyms
        article_label_list = add_label(article_synonymes, min_similarity,
                                       max_results, km_server)
        label_list_sql = []
        label_list_sql_sim = []
        for l in article_label_list:
            label_id = l[0][4:]  # strip the 'doc_' prefix
            similarity = l[1]
            label = l[2]
            label_list_sql.append(label)
            label_list_sql_sim.append((similarity, label))
        label_list_sql_sim_json = json.dumps(label_list_sql_sim)
        db.execute('update article_all1 set meal_sim=%s where id=%s',
                   (label_list_sql_sim, article_id))
        db.execute('update article_all1 set meal_sim_json=%s where id=%s',
                   (label_list_sql_sim_json, article_id))
    return None
def GensimClient(texts):
    gsDir = os.getcwd()
    logger.debug(u"GSDir %s" % gsDir)
    gss = gsDir + os.sep + u"gensim_server" + os.sep
    logger.debug(u"%s" % gss)
    server = SessionServer(gss)
    logger.info(u"%s" % server.status())
    # ids must be "doc_%i" to match the "doc_%d" keys queried below
    # (the original built them as "url_%i", which find_similar would not find)
    corpus = [{u"id": u"doc_%i" % n, u"tokens": utils.simple_preprocess(text)}
              for n, text in enumerate(texts)]
    # send 1k docs at a time
    utils.upload_chunked(server, corpus, chunksize=1000)
    server.train(corpus, method=u"lsi")
    # index the same documents that we trained on...
    server.index(corpus)
    # supply a list of document ids to be removed from the index
    # server.delete(["doc_5", "doc_8"])
    # overall index size unchanged (just 3 docs overwritten)
    server.index(corpus[:3])
    # Option one
    for n in range(0, len(corpus)):
        doc = u"doc_%d" % n
        logger.info(u"------------------------------------------------------")
        logger.info(u"Find similar N doc_%d to %s" % (n, corpus[n][u"tokens"]))
        logger.info(u"------------------------------------------------------")
        for sim in server.find_similar(doc):
            m = int(sim[0][-1:])  # note: only correct for indexes of up to 10 docs
            if m != n:
                logger.info(u"\t%s \t %3.2f : M %s" % (sim[0], float(sim[1]), corpus[m][u"tokens"]))
                d = [unicode(x) for x in corpus[n][u"tokens"]]
                e = [unicode(y) for y in corpus[m][u"tokens"]]
                common = set(e) & set(d)
                logger.info(u"\t\tCommon Topics : %s" % list(common))
    if False:
        # Option two
        doc = {u"tokens": utils.simple_preprocess(u"Graph and minors and humans and trees.")}
        logger.info(u"%s" % server.find_similar(doc, min_score=0.4, max_results=50))
def findSimilarities(self, texts):
    gsDir = os.getcwd()
    logger.debug(u"GSDir %s" % gsDir)
    gss = gsDir + os.sep + u"gensim_server" + os.sep
    logger.debug(u"%s" % gss)
    server = SessionServer(gss)
    corpus = [{u"id": u"doc_%i" % num, u"tokens": utils.simple_preprocess(text)}
              for num, text in enumerate(texts)]
    # the server is assumed to be trained and indexed already:
    # utils.upload_chunked(server, corpus, chunksize=1000)  # send 1k docs at a time
    # server.train(corpus, method=u"lsi")
    # server.index(corpus)  # index the same documents that we trained on...
    # server.index(corpus[:3])  # overall index size unchanged (just 3 docs overwritten)
    if True:
        # Option one
        for n in range(0, len(texts)):
            doc = u"doc_%d" % n
            self.output += u"Find similar doc_%d to %s%s" % (n, corpus[n][u"tokens"], os.linesep)
            logger.info(self.output[:-1])
            for sim in server.find_similar(doc):
                m = int(sim[0][-1:])
                if m != n:
                    self.output += u"\t%s \t %3.2f : %s%s" % (sim[0], float(sim[1]), corpus[m][u"tokens"], os.linesep)
                    logger.info(self.output[:-1])
                    d = [unicode(x) for x in corpus[n][u"tokens"]]
                    e = [unicode(y) for y in corpus[m][u"tokens"]]
                    common = set(e) & set(d)
                    self.output += u"\tCommon Topics : %s%s" % (list(common), os.linesep)
                    logger.info(self.output[:-1])
    else:
        # Option two
        doc = {u"tokens": utils.simple_preprocess(u"Graph and minors and humans and trees.")}
        logger.info(u"%s" % server.find_similar(doc, min_score=0.4, max_results=50))
    return self.output
def index_nodes():
    print "loading server"
    service = SessionServer('/mnt/hgfs/Shared/my_server/')
    print "loading model"
    service.open_session()
    service.session.drop_index()
    service.session.model = simserver.SimModel.load("/mnt/hgfs/Shared/wiki")
    print service.session.model
    print "loading nodes"
    nodes = Node.objects.all()
    print "Building corpus"
    corpus = [{'id': node.pk, 'tokens': re.findall(r"[\w']+", node.question.lower())}
              for node in nodes]
    print "indexing corpus"
    service.index(corpus)
    print service.stable.keys()  # the original printed the method object instead of calling it
    service.commit()
def __init__(self):
    self.server = SessionServer(r'c:\temp\data_server')
    print self.server
def __init__(self):
    self.service = SessionServer('SearchServer/')
#!/usr/bin/python
# -*- coding: utf-8 -*-
import sys
reload(sys)
sys.setdefaultencoding("utf-8")

import logging
#logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

from gensim import utils
from simserver import SessionServer

service = SessionServer('c:/temp/gensim')  # or wherever


def index_input_texts():
    texts = ["Human machine interface for lab abc computer applications",
             "A survey of user opinion of computer system response time",
             "The EPS user interface management system",
             "System and human system engineering testing of EPS",
             "Relation of user perceived response time to error measurement",
             "The generation of random binary unordered trees",
             "The intersection graph of paths in trees",
             "Graph minors IV Widths of trees and well quasi ordering",
             "Graph minors A survey"]
    corpus = [{'id': 'doc_%i' % num, 'tokens': utils.simple_preprocess(text)}
              for num, text in enumerate(texts)]
    service.train(corpus, method='lsi')
    service.index(corpus)  # index the same documents that we trained on...


def query_the_index(input):
    doc = {'tokens': utils.simple_preprocess(input)}
    # assumed completion (the original snippet ends after building `doc`);
    # find_similar is the standard simserver query call
    return service.find_similar(doc)
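# A minimal driver sketch for the two helpers above (assumed, not part of the
# original snippet): index the sample texts once, then query the index.
if __name__ == '__main__':
    index_input_texts()
    for doc_id, score, payload in query_the_index("human computer interaction"):
        print doc_id, score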
    while n.content[0:3] == '-->':
        if n.content[3:5] == '*.':
            if Tag.objects.filter(title=n.content[5:]).exists():
                tag = Tag.objects.get(title=n.content[5:])
                if tag.node_set.all().exists():
                    n = choice(tag.node_set.all())
        else:
            if Node.objects.filter(title=n.content[3:]).exists():
                n = Node.objects.get(title=n.content[3:])
            else:
                log.debug('%s not found' % name)
                n = Node.objects.get(title='idk')
    context = {'reply': parse_content(n.content, 'display'), 'title': n.title}
    return json_response(context), n

service = SessionServer('/mnt/hgfs/Shared/my_server/')
service.stable.model = simserver.SimModel.load("/mnt/hgfs/Shared/wiki")


def parse(arguments, method):
    name = arguments['name']
    if method == 'GET':
        n = None
        while not n:
            matches = service.find_similar({'tokens': re.findall(r"[\w']+", name)}, .9)
            if len(matches):
                n = Node.objects.get(pk=matches[0][0])
            else:
                matches = service.find_similar({'tokens': re.findall(r"[\w']+", name)}, .8)
                if len(matches):
                    n = Node.objects.get(pk=matches[0][0])
                else:
    'v3', 'v4', 'v5', 'v9', 'w', 'x', 'z'
]

i_tag_num_threshold = 5

#===========================
i_1000_flag = 1
#i_1000_flag = 0
#===========================

#server = SessionServer('/tmp/my_server')  # resume server (or create a new one)
#server = SessionServer('./my_server')
#server = SessionServer('./my_server_A')
server = SessionServer(folder_A)  # resume server (or create a new one)

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
logger = logging.getLogger('gensim.similarities.simserver')


def load_words():
    with open('words_alpha.txt') as word_file:
        valid_words = set(word_file.read().split())
    return valid_words
        doc['id'] = 'html_%d' % obj.id
        doc['tokens'] = list(Tokenize(obj.content))
        if obj.id % 1000 == 0:
            print 'processing', obj.id
        yield doc


def iter_corpus():
    for obj in SogouCorpus.objects.all():
        doc = {}
        doc['id'] = 'sogou_%d' % obj.id
        doc['tokens'] = obj.tokens.split(',')
        if obj.id % 1000 == 0:
            print 'processing', obj.id
        yield doc

server = SessionServer('/tmp/server')
#server = Pyro4.Proxy(Pyro4.locateNS().lookup('gensim.testserver'))


def train_server():
    training_corpus = iter_documents()
    #training_corpus = iter_corpus()
    #server.train(list(training_corpus), method='lsi')
    #print 'train finished'
    server.index(training_corpus)
    print 'index finished'
    server.optimize()
    print 'optimize finished'


def update_keywords():
    for html in HtmlContent.objects.filter(~Q(retry=3)).filter(~Q(content='')):
        html.tags, html.summerize = summarize(html.content)
        html.summerize = html.summerize[0:388]
    tokens = preprocessor.tokenize(qtext)
    tokens = map(preprocessor.deNoise, tokens)
    devocalize_tokens = map(preprocessor.removeDiacritics, tokens)
    denoised_tokens = map(preprocessor.deNoise, devocalize_tokens)
    normalized_tokens = map(preprocessor.normalizeAlef, denoised_tokens)
    normalized_tokens = map(preprocessor.normalizeAggressive, normalized_tokens)
    lemmatized_tokens = map(preprocessor.lemmatize, normalized_tokens)
    yield LabeledSentence(words=[w for w in tokens], tags=['%s' % qid])

from simserver import SessionServer
service = SessionServer('tmp/')
service.train(corpus, method='lsi')

import sys


class QuestionPairSimilarity(object):
    def __iter__(self):
        qs = LabeledQuestion('input/SemEval2016-Task3-CQA-MD-test.xml')
        for q in qs:
            service.drop_index()
            qid = q.tags[0]
            print qid
def __init__(self): self.server = SessionServer("./tmp")
def resume_scoring(self):
    """
    Cleans the data and runs the resume matching code.
    User is requested to pass the job description name, session_name
    and final output file name. Final output is an excel file.
    @param: job_description - string
    @param: session_name - string
    @param: output_filename - string
    Once you run this code it will prompt you to select the path of the directory.
    """
    self.job_description = self.select_job_description()
    if len(self.job_description) > 0:
        #self.job_description_path = os.path.join(self.job_description_path + "/" + job_description)
        self.raw_resumes_path = self.select_resume_path()
        if len(self.raw_resumes_path) > 0:
            self.save_text_files_path = self.select_rawtext_path()
            self.raw_resumes_to_text()
            self.jd_to_text()
            self.file_list_text = glob.glob(self.save_text_files_path + "/*.*")
            print self.file_list_text
            self.resume_id = []
            for i in range(0, len(self.file_list_text)):
                self.resume_id.append([int(s) for s in self.file_list_text[i].split() if s.isdigit()])
            self.documents = []
            for filename in self.file_list_text:
                with open(filename, 'r') as f:
                    self.documents.append(f.read())
            self.corpus = [{'id': 'doc_%s' % num, 'tokens': utils.simple_preprocess(text)}
                           for num, text in enumerate(self.documents)]
            self.count = 0
            while self.count < len(self.resume_id):
                for item in self.corpus:
                    if self.resume_id[self.count] == []:
                        item['id'] = 'doc_jd'
                    else:
                        item['id'] = str(self.resume_id[self.count])
                    self.count = self.count + 1
            # see documentation here: http://docs.python.org/2/library/string.html
            self.regex = re.compile('[%s]' % re.escape(string.punctuation))
            self.tokenized_corpus_no_punctuation = []
            for review in self.corpus:
                self.new_corpus = []
                for token in review['tokens']:  # iterate the tokens, not the dict keys
                    self.new_token = self.regex.sub(u'', token)
                    if not self.new_token == u'':
                        self.new_corpus.append(self.new_token)
                self.tokenized_corpus_no_punctuation.append(self.new_corpus)
            self.dir_name = self.setting_up_server_session_dir()
            self.server = SessionServer(self.dir_name)
            logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
            self.server.train(self.corpus, method='lsi')
            self.server.index(self.corpus)
            self.lst = self.server.find_similar('doc_jd')
            self.series = pd.DataFrame(self.lst)
            self.series.columns = ['Resume_ID', 'Score', 'none']
            self.series.index.names = ['Rank']
            self.series = self.series.drop(self.series.columns[2], axis=1)
            self.final_excel_path()
# -*- coding: utf-8 -*-
"""
Created on Mon Sep 10 14:34:49 2018

@author: afcarl
"""
from gensim import utils
from simserver import SessionServer
import gensim

#server = SessionServer('/tmp/my_server')  # resume server (or create a new one)
server = SessionServer('./my_server')  # resume server (or create a new one)

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
logger = logging.getLogger('gensim.similarities.simserver')

document = {'id': 'some_unique_string',
            'tokens': ['content', 'of', 'the', 'document', '...'],
            'other_fields_are_allowed_but_ignored': None}

texts = ["Human machine interface for lab abc computer applications",
         "A survey of user opinion of computer system response time",
         "The EPS user interface management system",
         "System and human system engineering testing of EPS",
         "Relation of user perceived response time to error measurement",
         "The generation of random binary unordered trees",
import json
from bson import json_util
from bson.objectid import ObjectId
from flask import Flask, request
from mongokit import Document
from flask.ext.pymongo import PyMongo
import datetime
from simserver import SessionServer
from gensim import utils
import itertools
from pymongo import MongoClient

sim_server = SessionServer('./tmp/idea_match_server')

client = MongoClient('localhost', 3001)
db = client.meteor
cursor = db.ideas.find({})

corpus = [{'id': idea['_id'], 'tokens': utils.simple_preprocess(idea['text'])}
          for idea in cursor]

utils.upload_chunked(sim_server, corpus, chunksize=1000)
sim_server.train(corpus, method='lsi')
sim_server.index(corpus)

app = Flask(__name__)
app.config['MONGO_HOST'] = 'localhost'
app.config['MONGO_PORT'] = 3001
app.config['MONGO_DBNAME'] = 'meteor'
mongo = PyMongo(app)
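# Hypothetical follow-up sketch (not in the original): a Flask route that
# queries the index built above. The route name and request shape are
# assumptions; find_similar itself is the standard simserver query call.
@app.route('/similar', methods=['POST'])
def similar_ideas():
    query = {'tokens': utils.simple_preprocess(request.json['text'])}
    # find_similar returns a list of (idea_id, similarity, payload) tuples
    return json.dumps(sim_server.find_similar(query, min_score=0.5, max_results=10))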
import os
from gensim import utils
from simserver import SessionServer

server = SessionServer('myserver')

w = open('data/1/549518.txt').read()
docin = {'id': '549518', 'tokens': utils.simple_preprocess(w)}
print server.find_similar(docin)
# An example by Steven Du, showing how to use this server for Chinese documents.
#
# train: let the server learn the LSI model
# index: set up your own pool of documents that you want the query to search
# find_similar: find the similar documents in the indexed pool of documents
#
# Input to this server (train, index, find_similar) is a list of
# {'id': 'doc_%i' % num, 'tokens': text.split()}

from simserver import SessionServer
import codecs
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

serverFilePath = './temp_index_dir'
server = SessionServer(serverFilePath)  # resume server (or create a new one)

# Pre-segmented Chinese chat logs (tokens separated by spaces):
texts = ['如果 也 没有 的话 。 这个 确实 没有 办法 了 。 我 个人 建议您 重装 一遍 这个 软件 看看 是否 还是 一样 卸载 程序 里 也 没有 呢',
         '我能 直接 删掉 这些 文件 吗 ?',
         '不 建议 呢 。 因为 不 确定 这些 文件 中 是否 有 其他软件 的 文件 呢',
         '好 的 , 使用 看看 会断 么',
         '它 只是 有时 自动 掉 , 以后 看看 怎么样',
         '这个 是 您 无线 驱动 : http : / / driverdl . lenovo . com . cn / lenovo / driverfilesuploadfloder / 32228 / wlan _ win8 . 1 . exe',
         '要是 问题 还是 出现 您 可以 安装 这个 试试',
         '10 几个 版本 都 试过 了 么',
         '目前 可以 确认 08 版本 以上 正常 运行',
         '这个 是 电源 吧',
         'http : / / weixin . lenovo . com . cn / img / files / user _ files / olhctjgaid22zzdnezguwbxzuxrq / voice / 16 _ 03 _ 17 / 1104209 _ 729724 _ 1458213046 . jpg',
         '现在 不是 运行 问题 , 是 安装 问题',
         '点 电源 卸载 没 反应 呢',
         # ... (remaining utterances elided in the original snippet)
         ]
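# A hedged sketch of the train/index/query flow the header comments describe
# (assumed driver code; the original snippet is truncated before this point).
# The texts are already space-segmented, so text.split() is the tokenizer.
corpus = [{'id': 'doc_%i' % num, 'tokens': text.split()}
          for num, text in enumerate(texts)]
server.train(corpus, method='lsi')   # learn the LSI model
server.index(corpus)                 # make the documents searchable
print(server.find_similar('doc_0'))  # documents similar to the first one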
from flask import Flask
from flask import json
from flask import request
from flask import Response
import os

app = Flask(__name__)

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

from gensim import utils
from simserver import SessionServer

# BEFORE TRAINING A NEW MODEL - CHANGE THE PATH BELOW
service = SessionServer('/tmp/mirFlickr4500')

# FORMAT FOR DATA POSTED TO /index:
# {"id": NUMBER, "tokens": ["STRING", "STRING", "STRING"]}

@app.route('/test', methods=['GET'])
def test():
    return "server is running"

@app.route('/index', methods=['POST'])
def indexPhoto():
    print(request.json)
    service.index(request.json)
    return "Received: " + json.dumps(request.json)
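# A usage sketch for the /index route above (assumed client code; the host,
# port, and document values are guesses, and the document is wrapped in a
# list because SessionServer.index expects a corpus, i.e. a list of
# documents). The service must already hold a trained model.
import requests

doc = {"id": "photo_1", "tokens": ["sunset", "beach", "holiday"]}
resp = requests.post("http://localhost:5000/index", json=[doc])
print(resp.text)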
#coding=utf-8
from simserver import SessionServer

server = SessionServer('/tmp/my_server')  # resume server (or create a new one)
def __init__(self):
    self.service = SessionServer('SearchServer/')
    self.search_results = []
for link in urls:
    print "Reading page: " + repr(link)
    status, response = http.request(link)
    crawldocs[link] = response

# extract the main content of each crawled page
for link, raw_html in crawldocs.iteritems():
    maincontent[link] = g.extract(raw_html=raw_html)

originaldoc_maincontent = g.extract(raw_html=response)

from simserver import SessionServer
server = SessionServer('/tmp/my_simserver')

import logging
from gensim import utils

# 'tokens' must be a list of tokens, not a raw string
corpus = [{'id': link, 'tokens': utils.simple_preprocess(content.cleaned_text)}
          for link, content in maincontent.iteritems()]
server.train(corpus, method='lsi')
server.index(corpus)