def with_synonyme_meal():
    for i in range(0, len(label_meal_db)):  # for i in range(0,3):
        label_list = label_meal_db[i]
        label_id = label_list['id']
        label = label_list['name']
        label_translate_synonymes = translate_synonymes(label)
        #label_translate_synonymes = label
        #label_dic.append({'id': 'doc_%i' % label_id, 'tokens': [label_translate_synonymes], 'payload': label_translate_synonymes})
        label_dic.append({'id': 'doc_%i' % label_id, 'tokens': cut(label_translate_synonymes), 'payload': label})
        logger.info(i)
        logger.info('label_id= %s' % label_id)
    '''
    for j in range(0, len(mysql_db)):
        mysql_data_list = mysql_db[j]
        article_id = mysql_data_list[0]    # id
        article_label = mysql_data_list[1] # label
        article_title = mysql_data_list[2] # title
        article_text = mysql_data_list[4]  # text
        if article_title is None:
            article_title = ''
        if article_text is None:
            article_text = ''
        article_title_text = article_title + article_text
        article_title_text_translate_synonymes = translate_synonymes(article_title_text)
        article_title_text_dic.append({'id': 'doc_%i' % article_id, 'tokens': cut(article_title_text_translate_synonymes), 'payload': article_title_text})
    '''
    server_path = os.path.join(os.path.abspath(os.path.dirname(__file__)),
                               'servers/create_test_withsyn_meal1')  # model path
    server = SessionServer(server_path)
    server.drop_index()                                      # drop all existing indexes
    utils.upload_chunked(server, label_dic, chunksize=1000)  # upload to simserver in chunks
    server.train(label_dic, method='lsi')                    # train on the processed questions
    server.index(label_dic)                                  # build the index files
    #print(server.status())
    return None
def GensimClient(texts):
    similarities = None
    gsDir = os.getcwd()
    gss = gsDir + os.sep + u"gensim_server" + os.sep
    server = SessionServer(gss)
    logger.debug(u"%s" % server.status())
    try:
        corpus = [{u"id": u"doc_%i" % num, u"tokens": utils.simple_preprocess(text)}
                  for num, text in enumerate(texts)]
        # send 1k docs at a time
        utils.upload_chunked(server, corpus, chunksize=1000)
        server.train(corpus, method=u"lsi")
        # index the same documents that we trained on...
        server.index(corpus)
        similarities = findSimilar(texts, server, corpus)
    except Exception, msg:
        logger.debug(u"%s" % msg)
    return similarities
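# findSimilar() is defined elsewhere in the project this snippet comes from.
# A minimal sketch of what it might look like, assuming it collects
# server.find_similar() results for every indexed document (the body and
# return shape here are assumptions, not the original implementation):
def findSimilar(texts, server, corpus):
    similarities = {}
    for num in range(len(corpus)):
        doc_id = u"doc_%i" % num
        # each hit is a (doc_id, score, payload) tuple
        similarities[doc_id] = server.find_similar(doc_id)
    return similarities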
def main():
    json_data = open('./items.json')
    data = json.load(json_data)
    print 'starting'
    documents = []
    for i in range(0, len(data) - 1):
        print i
        s = ""
        identifier = ""
        title = ""
        summary = ""  # initialise so the payload is valid even if parsing fails
        totalText = ""
        try:
            s = data[i]['identifier']
            identifier = s[0][18:].replace("%3A", "")
            summary = data[i]['desc'][0].strip()
            title = data[i]['title'][0].strip()
            totalText += summary
            totalText += " "
            totalText += title
            totalText += " "
            totalText += identifier
        except:
            print "error"
        documentPayload = {'identifier': identifier, 'title': title, 'summary': summary}
        documents.append({'text': totalText, 'payload': documentPayload})
    corpus = [{'id': text['payload']['identifier'],
               'tokens': utils.simple_preprocess(text['text']),
               'payload': text['payload']}
              for text in documents]
    service = SessionServer('./thesite/simdatabase')
    service.train(corpus, method='lsi')
    service.index(corpus)
    service.commit()
class Indexer(object):
    def __init__(self):
        self.server = SessionServer("./tmp")

    def _create_corpus(self, texts):
        corpus = []
        for id, text in texts:
            corpus.append({'id': id, 'tokens': utils.simple_preprocess(text)})
        return corpus

    def index(self, texts):
        corpus = self._create_corpus(texts)
        utils.upload_chunked(self.server, corpus, chunksize=1000)
        self.server.train(corpus, method='lsi')
        self.server.index(corpus)

    def add_documents(self, texts):
        self.index(texts)

    def recommend(self, id, max_results=10):
        print "Id is: ", id
        return self.server.find_similar(id, max_results=max_results)
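# A minimal usage sketch for the Indexer above; the (id, text) pairs are
# made-up sample data:
indexer = Indexer()
indexer.index([('a', 'human machine interface'),
               ('b', 'user interface management system'),
               ('c', 'graph minors and trees')])
# returns up to 10 (doc_id, score, payload) tuples most similar to 'a'
print indexer.recommend('a')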
def gensimsimserverII():
    reloadData = True
    useremoteserver = False
    if useremoteserver:
        server = Pyro4.Proxy(Pyro4.locateNS().lookup('gensim.testserver'))
    else:
        server = SessionServer('/tmp/testserver')  # SessionServer('myserver')
    if reloadData:
        client = Elasticsearch([Util.config['eshost']])
        # response = client.search(
        #     index="blogs",
        #     body={
        #         "size": "5000",
        #         "query": {
        #             "match": {
        #                 "country": country
        #             }
        #         }
        #     }
        # )
        response = client.search(
            index="blogs",
            body={
                "size": "5000",
                "query": {"match_all": {}}
            }
        )
        stops = [unicode(word) for word in stopwords.words('english')] + \
                [u':-).', u'–', u'-', u'…', '!!!', '!!', 'x', 'got', 'get', 'went',
                 'us', u'i\'m', '&', 'it\'s', 'i\'ve']
        corpus = []
        for hit in response['hits']['hits']:
            try:
                body = hit["_source"]["body"]
                id = hit["_source"]["url"]
                title = hit["_source"]["title"]
                newBody = [word for word in body.lower().split() if word not in stops]
                corpus.append({'id': id, 'tokens': newBody, 'title': title})
                server.stable.payload[id] = title
            except Exception:
                logger.exception("Couldn't parse blog id: {0}".format(hit["_id"]))
        server.train(corpus, method='lsi')
        server.index(corpus)
    print "********************************************"
    print(server.find_similar('http://www.travelpod.com/travel-blog-entries/bvrlymm/1/1428224775/tpod.html', max_results=5))
def GensimClient(texts):
    gsDir = os.getcwd()
    logger.debug(u"GSDir %s" % gsDir)
    gss = gsDir + os.sep + u"gensim_server" + os.sep
    logger.debug(u"%s" % gss)
    server = SessionServer(gss)
    logger.info(u"%s" % server.status())
    corpus = [{u"id": u"url_%i" % n, u"tokens": utils.simple_preprocess(text)}
              for n, text in enumerate(texts)]
    # send 1k docs at a time
    utils.upload_chunked(server, corpus, chunksize=1000)
    server.train(corpus, method=u"lsi")
    # index the same documents that we trained on...
    server.index(corpus)
    # supply a list of document ids to be removed from the index
    # server.delete(["doc_5", "doc_8"])
    # overall index size unchanged (just 3 docs overwritten)
    server.index(corpus[:3])
    # Option one
    for n in range(0, len(corpus)):
        doc = u"url_%d" % n  # the corpus ids above use the "url_" prefix
        logger.info(u"------------------------------------------------------")
        logger.info(u"Find similar N url_%d to %s" % (n, corpus[n][u"tokens"]))
        logger.info(u"------------------------------------------------------")
        for sim in server.find_similar(doc):
            # parse the numeric suffix of the id (robust beyond single digits)
            m = int(sim[0].split(u"_")[-1])
            if m != n:
                logger.info(u"\t%s \t %3.2f : M %s" % (sim[0], float(sim[1]), corpus[m][u"tokens"]))
                d = [unicode(x) for x in corpus[n][u"tokens"]]
                e = [unicode(y) for y in corpus[m][u"tokens"]]
                s1 = set(e)
                s2 = set(d)
                common = s1 & s2
                lc = [x for x in common]
                logger.info(u"\t\tCommon Topics : %s" % (lc))
    if False:
        # Option two
        doc = {u"tokens": utils.simple_preprocess(u"Graph and minors and humans and trees.")}
        logger.info(u"%s" % server.find_similar(doc, min_score=0.4, max_results=50))
def index_nodes():
    print "loading server"
    service = SessionServer('/mnt/hgfs/Shared/my_server/')
    print "loading model"
    service.open_session()
    service.session.drop_index()
    service.session.model = simserver.SimModel.load("/mnt/hgfs/Shared/wiki")
    print service.session.model
    print "loading nodes"
    nodes = Node.objects.all()
    print "Building corpus"
    corpus = [{'id': node.pk, 'tokens': re.findall(r"[\w']+", node.question.lower())}
              for node in nodes]
    print "indexing corpus"
    service.index(corpus)
    print service.stable.keys()
    service.commit()
def gensimsimserver():
    server = SessionServer('myserver')
    texts = ["Human machine interface for lab abc computer applications",
             "A survey of user opinion of computer system response time",
             "The EPS user interface management system",
             "System and human system engineering testing of EPS",
             "Relation of user perceived response time to error measurement",
             "The generation of random binary unordered trees",
             "The intersection graph of paths in trees",
             "Graph minors IV Widths of trees and well quasi ordering",
             "Graph minors A survey"]
    corpus = [{'id': 'doc_%i' % num, 'tokens': utils.simple_preprocess(text)}
              for num, text in enumerate(texts)]
    server.train(corpus, method='lsi')
    server.index(corpus)
    print "********************************************"
    print(server.find_similar('doc_0'))
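# Besides an indexed id, find_similar() also accepts a raw document dict, as
# the later snippets show. A small helper sketch that queries a server like
# the one trained above with free text (the thresholds are illustrative):
def query_gensimsimserver(server, text):
    doc = {'tokens': utils.simple_preprocess(text)}
    return server.find_similar(doc, min_score=0.4, max_results=50)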
class gensim_news(object):
    def __init__(self):
        self.server = SessionServer(r'c:\temp\data_server')
        print self.server

    def initialise(self, docs):
        corpus4server = self.create_server_corpus(docs)
        self.server.train(corpus4server, method='lsi')

    def create_server_corpus(self, docs):
        return [{'id': '%s' % id, 'tokens': simple_preprocess(text)}
                for id, text in docs.iteritems()]

    def gensim_similarities(self, docs_dict, new=False):
        text4server = self.create_server_corpus(docs_dict)
        sims = self.server.find_similar(text4server[0], min_score=0.90)
        self.server.index(text4server)
        return sims
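# A minimal usage sketch for gensim_news; the article dicts are made-up
# sample data keyed by article id:
news = gensim_news()
news.initialise({'a1': u'stocks rally on earnings news',
                 'a2': u'central bank holds interest rates'})
# find near-duplicates of a new article, then add it to the index
print news.gensim_similarities({'a3': u'markets rally after earnings beat'})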
class IndexContent:
    def __init__(self):
        self.service = SessionServer('SearchServer/')

    def yield_page_text(self):
        for page_file in os.listdir('CrawlData'):
            content = open('CrawlData/' + page_file, 'r')
            page_content = content.read()
            content.close()
            page_url = re.sub(r'\s', '/', page_file)
            yield page_url, page_content

    def generate_index(self):
        corpus = [{'id': '%s' % url, 'tokens': utils.simple_preprocess(text)}
                  for url, text in self.yield_page_text()]
        self.service.train(corpus, method='lsi')
        self.service.index(corpus)
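# Usage sketch: build the index from the crawled pages, then query it with
# free text (the query string is made up):
content_index = IndexContent()
content_index.generate_index()
doc = {'tokens': utils.simple_preprocess('open source search engine')}
print content_index.service.find_similar(doc, min_score=0.4, max_results=10)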
class SearchServer:
    def __init__(self):
        self.service = SessionServer('SearchServer/')

    def generate_index(self):
        def page_text():
            for page_file in os.listdir('CrawlData'):
                content = open('CrawlData/' + page_file, 'r')
                page_content = content.read()
                content.close()
                page_url = re.sub(r'\s', '/', page_file)
                yield page_url, page_content
        corpus = [{'id': '%s' % url, 'tokens': utils.simple_preprocess(text)}
                  for url, text in page_text()]
        self.service.train(corpus, method='lsi')
        self.service.index(corpus)

    def query(self):
        user_string = raw_input('Enter query: ')
        doc = {'tokens': utils.simple_preprocess(user_string)}
        for results in self.service.find_similar(doc, min_score=0.4, max_results=50):
            print results[0]
def test_Gensim(texts):
    gsDir = os.getcwd()
    logger.debug(u"GSDir %s" % gsDir)
    gss = gsDir + os.sep + u"gensim_server" + os.sep
    logger.debug(u"%s" % gss)
    server = SessionServer(gss)
    u"""
    texts = [u"Human machine interface for lab abc computer applications",
             u"A survey of user opinion of computer system response time",
             u"The EPS user interface management system",
             u"System and human system engineering testing of EPS",
             u"Relation of user perceived response time to error measurement",
             u"The generation of random binary unordered trees",
             u"The intersection graph of paths in trees",
             u"Graph minors IV Widths of trees and well quasi ordering",
             u"Graph minors A survey",
             u"Why use a computer"]
    """
    logger.info(u"%s" % server.status())
    corpus = [{u"id": u"doc_%i" % num, u"tokens": utils.simple_preprocess(text)}
              for num, text in enumerate(texts)]
    # send 1k docs at a time
    utils.upload_chunked(server, corpus, chunksize=1000)
    server.train(corpus, method=u"lsi")
    # index the same documents that we trained on...
    server.index(corpus)
    # supply a list of document ids to be removed from the index
    # server.delete(["doc_5", "doc_8"])
    # overall index size unchanged (just 3 docs overwritten)
    server.index(corpus[:3])
    # Option one
    for n in range(0, len(texts)):
        doc = u"doc_%d" % n
        logger.info(u"Find similar doc_%d to %s" % (n, corpus[n][u"tokens"]))
        for sim in server.find_similar(doc):
            # parse the numeric suffix of the id (robust beyond single digits)
            m = int(sim[0].split(u"_")[-1])
            if m != n:
                logger.info(u"\t%s \t %3.2f : %s" % (sim[0], float(sim[1]), corpus[m][u"tokens"]))
                d = [unicode(x) for x in corpus[n][u"tokens"]]
                e = [unicode(y) for y in corpus[m][u"tokens"]]
                s1 = set(e)
                s2 = set(d)
                common = s1 & s2
                lc = [x for x in common]
                logger.info(u"\tCommon Topics : %s\n" % (lc))
    if False:
        # Option two
        doc = {u"tokens": utils.simple_preprocess(u"Graph and minors and humans and trees.")}
        logger.info(u"%s" % server.find_similar(doc, min_score=0.4, max_results=50))
class SimService(object):
    def __init__(self, path, preprocess, deaccent=True, lowercase=True,
                 stemmer=None, stopwords=None):
        self.service = SessionServer(path)
        self.deaccent = deaccent
        self.lowercase = lowercase
        self.preprocess = preprocess
        self.stemmer = stemmer
        self.stopwords = stopwords

    def find_similar(self, data, min_score, max_results):
        if isinstance(data, basestring):
            doc = data.strip()
            if ' ' in doc:
                # multi-word input is treated as free text, not an indexed id
                doc = {'tokens': self.preprocess(data, deacc=self.deaccent,
                                                 lowercase=self.lowercase,
                                                 errors='ignore',
                                                 stemmer=self.stemmer,
                                                 stopwords=self.stopwords)}
            try:
                return {'status': 'OK',
                        'response': self.service.find_similar(doc, min_score=min_score, max_results=max_results)}
            except ValueError:
                return {'status': 'NOTFOUND', 'response': []}
        else:
            result = {}
            for doc in data:
                try:
                    result[doc] = self.service.find_similar(doc, min_score=min_score, max_results=max_results)
                except ValueError:
                    pass
            if result:
                return {'status': 'OK', 'response': result}
            else:
                return {'status': 'NOTFOUND', 'response': []}

    def _buffer(self, data):
        i = 0
        for d in data:
            if 'tokens' in d:
                self.service.buffer([{'id': d['id'], 'tokens': d['tokens']}])
            else:
                self.service.buffer([{'id': d['id'],
                                      'tokens': list(self.preprocess(d['text'], deacc=self.deaccent,
                                                                     lowercase=self.lowercase, errors='ignore',
                                                                     stemmer=self.stemmer,
                                                                     stopwords=self.stopwords))}])
            i += 1
        return i

    def train(self, data):
        self.service.set_autosession(False)
        self.service.open_session()
        i = self._buffer(data)
        self.service.train(method='lsi')
        logger.info('training complete, committing changes')
        self.service.commit()
        self.service.set_autosession(True)
        return {'status': 'OK', 'response': i}

    def index(self, data):
        self.service.set_autosession(False)
        self.service.open_session()
        i = self._buffer(data)
        self.service.index()
        logger.info('indexing complete, committing changes')
        self.service.commit()
        self.service.set_autosession(True)
        return {'status': 'OK', 'response': i}

    def optimize(self):
        self.service.set_autosession(False)
        self.service.open_session()
        self.service.optimize()
        self.service.commit()
        self.service.set_autosession(True)
        return {'status': 'OK', 'response': 'index optimized'}

    def delete(self, data):
        self.service.set_autosession(False)
        self.service.open_session()
        self.service.delete(data)
        self.service.commit()
        self.service.set_autosession(True)
        return {'status': 'OK', 'response': 'documents deleted'}

    def status(self):
        return {'status': 'OK', 'response': self.service.status()}

    def indexed_documents(self):
        return {'status': 'OK', 'response': self.service.keys()}

    def is_indexed(self, doc):
        return {'status': 'OK', 'response': doc in self.service.keys()}
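# A usage sketch for SimService. It expects a preprocess callable taking the
# extended keyword arguments above; gensim's simple_preprocess only accepts
# deacc, so this wrapper (a made-up example, not part of the original) adapts
# it and applies an optional stopword filter:
from gensim import utils

def my_preprocess(text, deacc=True, lowercase=True, errors='ignore',
                  stemmer=None, stopwords=None):
    tokens = utils.simple_preprocess(text, deacc=deacc)
    return [t for t in tokens if not stopwords or t not in stopwords]

sim_service = SimService('/tmp/simservice', preprocess=my_preprocess)
docs = [{'id': 'd1', 'text': 'human machine interface'},
        {'id': 'd2', 'text': 'user interface management system'},
        {'id': 'd3', 'text': 'graph minors and trees'}]
sim_service.train(docs)
sim_service.index(docs)
print sim_service.find_similar('d1', min_score=0.1, max_results=5)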
class Simple_resume_similarity_app_tk(Tkinter.Tk):
    def __init__(self):
        Tkinter.Tk.__init__(self)
        self.initialize()

    def initialize(self):
        button = Tkinter.Button(self, text=u"Click Me!", command=self.resume_scoring)
        button.grid(row=1, column=1)
        self.label1 = Tkinter.Label(self, text="Click Button To Generate Similarity Score")
        self.label1.grid(row=2, column=1)
        #self.img = Image.open('C:\\temp\\Resume_Similarity\\Resume_GUI\\wellsfargologo2.gif')
        #self.img_path = r"C:/temp/Resume_Similarity/Resume_GUI/wellsfargologo2.gif"
        #self.im = Image.open(self.img_path)
        #self.ph = PIL.ImageTk.PhotoImage(self.im)
        #self.label1 = Label(self, image=self.ph)
        #self.label1.image = self.ph
        #self.label1.pack(side="left")
        #logo = PhotoImage("C:/temp/Resume_Similarity/Resume_match_score/logo.jpg")
        #label.config(image=logo)

    def resume_scoring(self):
        """
        Cleans the data and runs the resume matching code. The user is
        prompted to select the job description file, the resume directory
        and the final output file name. The final output is an Excel file.

        @param: job_description - string
        @param: session_name - string
        @param: output_filename - string

        Once you run this code it will prompt you to select the path of the directory.
        """
        self.job_description = self.select_job_description()
        if len(self.job_description) > 0:
            #self.job_description_path = os.path.join(self.job_description_path + "/" + job_description)
            self.raw_resumes_path = self.select_resume_path()
            if len(self.raw_resumes_path) > 0:
                self.save_text_files_path = self.select_rawtext_path()
                self.raw_resumes_to_text()
                self.jd_to_text()
                self.file_list_text = glob.glob(self.save_text_files_path + "/*.*")
                print self.file_list_text
                self.resume_id = []
                for i in range(0, len(self.file_list_text)):
                    self.resume_id.append([int(s) for s in self.file_list_text[i].split() if s.isdigit()])
                self.documents = []
                for filename in self.file_list_text:
                    with open(filename, 'r') as f:
                        #d = f.read()
                        #print d
                        self.documents.append(f.read())
                self.corpus = [{'id': 'doc_%s' % num, 'tokens': utils.simple_preprocess(text)}
                               for num, text in enumerate(self.documents)]
                self.count = 0
                while self.count < len(self.resume_id):
                    for item in self.corpus:
                        if self.resume_id[self.count] == []:
                            item['id'] = 'doc_jd'
                        else:
                            item['id'] = str(self.resume_id[self.count])
                        self.count = self.count + 1
                self.regex = re.compile('[%s]' % re.escape(string.punctuation))
                # see documentation here: http://docs.python.org/2/library/string.html
                self.tokenized_corpus_no_punctuation = []
                for review in self.corpus:
                    self.new_corpus = []
                    for token in review['tokens']:  # iterate the tokens, not the dict keys
                        self.new_token = self.regex.sub(u'', token)
                        if not self.new_token == u'':
                            self.new_corpus.append(self.new_token)
                    self.tokenized_corpus_no_punctuation.append(self.new_corpus)
                self.dir_name = self.setting_up_server_session_dir()
                self.server = SessionServer(self.dir_name)
                logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
                self.server.train(self.corpus, method='lsi')
                self.server.index(self.corpus)
                self.lst = self.server.find_similar('doc_jd')
                self.series = pd.DataFrame(self.lst)
                self.series.columns = ['Resume_ID', 'Score', 'none']
                self.series.index.names = ['Rank']
                self.series = self.series.drop(self.series.columns[2], axis=1)
                self.final_excel_path()

    def setting_up_server_session_dir(self):
        self.dir = 'C:/temp/resume_server_script_server_logs'
        if not os.path.exists(self.dir):
            os.makedirs(self.dir)
        else:
            shutil.rmtree(self.dir)  # removes all the subdirectories!
            os.makedirs(self.dir)
        return self.dir

    def convert(self, fname, pages=None):
        if not pages:
            pagenums = set()
        else:
            pagenums = set(pages)
        output = StringIO()
        manager = PDFResourceManager()
        converter = TextConverter(manager, output, laparams=LAParams())
        interpreter = PDFPageInterpreter(manager, converter)
        infile = file(fname, 'rb')
        for page in PDFPage.get_pages(infile, pagenums):
            interpreter.process_page(page)
        infile.close()
        converter.close()
        text = output.getvalue()
        output.close()  # was "output.close" (missing the call parentheses)
        return text

    def select_job_description(self):
        root = Tkinter.Tk()
        root.withdraw()  # hide the tkinter window
        currdir = os.getcwd()
        self.tempdir = tkFileDialog.askopenfilename(parent=root, initialdir=currdir,
                                                    title="Select Job Description file")
        if len(self.tempdir) > 0:
            return self.tempdir

    def select_resume_path(self):
        root = Tkinter.Tk()
        root.withdraw()  # hide the tkinter window
        currdir = os.getcwd()
        tempdir = tkFileDialog.askdirectory(parent=root, initialdir=currdir,
                                            title="Select Resume Description Path")
        if len(tempdir) > 0:
            return tempdir

    def select_rawtext_path(self):
        root = Tkinter.Tk()
        root.withdraw()  # hide the tkinter window
        currdir = os.getcwd()
        tempdir = tkFileDialog.askdirectory(parent=root, initialdir=currdir,
                                            title="Select Path Where You Want To Save Text Files.")
        if len(tempdir) > 0:
            return tempdir

    def final_excel_path(self):
        root = Tkinter.Tk()
        root.withdraw()  # hide the tkinter window
        currdir = os.getcwd()
        savefile = tkFileDialog.asksaveasfilename(filetypes=(("Excel files", "*.xlsx"), ("All files", "*.*")),
                                                  parent=root, initialdir=currdir,
                                                  title="Final Excel Output Path")
        if len(savefile) > 0:
            self.series.to_excel(savefile + ".xlsx", index=True, sheet_name="Results")

    def raw_resumes_to_text(self):
        # read the raw resume file paths
        file_list_raw = glob.glob(self.raw_resumes_path + "/*.*")
        for fp in file_list_raw:
            # print fp
            ext = os.path.splitext(fp)[-1].lower()
            base = os.path.basename(fp)
            file_name = os.path.splitext(base)[0]
            #print ext
            if ext == ".docx":
                text = textract.process(fp)
                complete_name = os.path.join(self.save_text_files_path + "/" + file_name + ".txt")
                with open(complete_name, 'w') as f:
                    f.write(text)
            elif ext == ".pdf":
                text = self.convert(fp)
                complete_name = os.path.join(self.save_text_files_path + "/" + file_name + ".txt")
                with open(complete_name, 'w') as f:
                    f.write(text)
            elif ext == ".txt":
                shutil.copy(os.path.join(self.raw_resumes_path + str("/") + file_name + ".txt"),
                            os.path.join(self.save_text_files_path + str("/") + file_name + ".txt"))
            else:
                print "Unable to recognise this format."

    def jd_to_text(self):
        ext = os.path.splitext(self.job_description)[-1].lower()
        file_name_with_ext = os.path.basename(self.job_description)
        file_name = os.path.splitext(file_name_with_ext)[0].lower()
        if ext == ".docx":
            text = textract.process(self.job_description)
            complete_name = os.path.join(self.save_text_files_path + "/" + file_name + ".txt")
            with open(complete_name, 'w') as f:
                f.write(text)
        elif ext == ".pdf":
            text = self.convert(self.job_description)  # was the bare convert(), an unbound name
            complete_name = os.path.join(self.save_text_files_path + "/" + file_name + ".txt")
            with open(complete_name, 'w') as f:
                f.write(text)
        elif ext == ".txt":
            shutil.copy(self.job_description,
                        os.path.join(self.save_text_files_path + str("/") + file_name + ".txt"))
        else:
            print "This file format is not supported for now."
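# Launching the GUI above follows the standard Tkinter pattern (assuming the
# imports the class relies on -- Tkinter, tkFileDialog, textract, pdfminer,
# pandas, simserver -- are available):
if __name__ == '__main__':
    app = Simple_resume_similarity_app_tk()
    app.title('Resume Similarity')
    app.mainloop()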
class DocSimServer(object):
    def __init__(self):
        self.server = SessionServer(settings.SIMSERVER_WORKING_DIR)
        if not self.server.stable.model:
            self.server.train(self.corpus)
        if not self.server.stable.fresh_index:
            self.server.index(self.corpus)

    def find_similar(self, *args, **kwargs):
        return self.server.find_similar(*args, **kwargs)

    @property
    def corpus(self):
        try:
            return self._corpus
        except AttributeError:
            logging.info('creating corpus from DB')
            self._corpus = [dict(id=doc.id, tokens=doc.tokens())
                            for doc in Document.objects.all()]
            return self._corpus

    @property
    def document_ids(self):
        try:
            return self._document_ids
        except AttributeError:
            self._document_ids = list(
                Document.objects.values_list('id', flat=True).order_by('id'))
            return self._document_ids

    @property
    def index_id(self):
        try:
            return self._index_id
        except AttributeError:
            self._index_id = dict(enumerate(self.document_ids))
            return self._index_id

    @property
    def id_index(self):
        try:
            return self._id_index
        except AttributeError:
            self._id_index = dict((v, k) for k, v in self.index_id.iteritems())
            return self._id_index

    def similarity_matrix(self):
        logging.info('calculating similarity matrix')
        s = identity(len(self.id_index))
        for id in self.document_ids:
            for sim_id, score, none in self.server.find_similar(
                    id, min_score=.2, max_results=10000):
                if sim_id != id:
                    s[self.id_index[id]][self.id_index[sim_id]] = score
        return s

    @property
    def distance_matrix(self):
        try:
            return self._distance_matrix
        except AttributeError:
            s = self.similarity_matrix()
            logging.info('converting similarity matrix to distance matrix')
            self._distance_matrix = 2 * (1 - s)
            return self._distance_matrix

    def dbscan_clusters(self, eps=.4, min_samples=5):
        D = self.distance_matrix
        logging.info('starting dbscan')
        dbscan = DBSCAN(eps=eps, min_samples=min_samples, metric='precomputed')
        db = dbscan.fit(D)
        labels = db.labels_
        clusters = [l for l in set(labels) if l >= 0]  # outliers are -1; cluster labels start at 0
        logging.info('found %i clusters' % len(clusters))
        for c in clusters:
            cluster = Cluster(
                parameters=dict(algorithm='DBSCAN', eps=eps, min_samples=min_samples))
            cluster.save()
            doc_ids = [self.index_id[i[0]] for i in argwhere(labels == c)]
            logging.info(
                'cluster %s: %s documents' % (cluster.id, len(doc_ids)))
            cluster.documents.add(*doc_ids)
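# A usage sketch for DocSimServer (Document, Cluster and settings come from
# the surrounding Django project, and identity/argwhere from numpy):
dss = DocSimServer()
# ten most similar documents to the first document id
print(dss.find_similar(dss.document_ids[0], min_score=0.2, max_results=10))
# cluster the whole collection via the precomputed distance matrix
dss.dbscan_clusters(eps=0.4, min_samples=5)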
print("len(corpus) = ", len(corpus_A)) print("len(doc_title_list) = ", len(doc_title_list)) print("i_num_tokens_max = ", i_num_tokens_max) print("i_num_tokens_min = ", i_num_tokens_min) #sys.exit(0) utils.upload_chunked(server, corpus, chunksize=1000) # send 1k docs at a time #service = SessionServer('C:/0_afc_working/0_Doc2Vec/gensim-simserver-master/my_server/') # or wherever service = SessionServer(folder_B) # or wherever logger.info("simberver_local_A: service.train(corpus, method='lsi')") service.train(corpus, method='lsi') service.index(corpus) # index the same documents that we trained on... #sys.exit(0) #service.delete(['doc_5', 'doc_8']) # supply a list of document ids to be removed from the index #service.index(corpus[:3]) # overall index size unchanged (just 3 docs overwritten) ##print(service.find_similar('doc_0')) ##print(service.find_similar('02456-deep-learning')) #print(service.find_similar('02456-deep-learning', min_score=0.5, max_results=11)) ##[('doc_0', 1.0000001192092896, None), ('doc_2', 0.11294259130954742, None), ('doc_1', 0.09881371259689331, None), ('doc_3', 0.08786647021770477, None)] '''[('02456-deep-learning', 1.0, None), ('deep-docker', 0.8198882937431335, None), ('nvidia-docker-compose', 0.8188392519950867, None), ('introtodeeplearning_labs', 0.8047046661376953, None), ('dl-docker', 0.8002966642379761, None), ('pydocker-template', 0.7893627285957336, None), ('nvidia-docker', 0.7853963375091553, None), ('-deep-deep', 0.7793657779693604, None), ('DockerFiles-tdeboissiere', 0.7759870290756226, None), ('dockerfiles--Kaixhin', 0.7693668007850647, None),
"The generation of random binary unordered trees", "The intersection graph of paths in trees", "Graph minors IV Widths of trees and well quasi ordering", "Graph minors A survey"] corpus = [{'id': 'doc_%i' % num, 'tokens': utils.simple_preprocess(text)} for num, text in enumerate(texts)] utils.upload_chunked(server, corpus, chunksize=1000) # send 1k docs at a time service = SessionServer('C:/0_afc_working/0_Doc2Vec/gensim-simserver-master/my_server/') # or wherever logger.info("simberver_local_A: service.train(corpus, method='lsi')" ) service.train(corpus, method='lsi') service.index(corpus) # index the same documents that we trained on... service.delete(['doc_5', 'doc_8']) # supply a list of document ids to be removed from the index service.index(corpus[:3]) # overall index size unchanged (just 3 docs overwritten) print(service.find_similar('doc_0')) #[('doc_0', 1.0000001192092896, None), ('doc_2', 0.11294259130954742, None), ('doc_1', 0.09881371259689331, None), ('doc_3', 0.08786647021770477, None)] #print(service.find_similar('doc_5')) # we deleted doc_5 and doc_8, remember? #ValueError: document 'doc_5' not in index doc = {'tokens': gensim.utils.simple_preprocess('Graph and minors and humans and trees.')} print(service.find_similar(doc, min_score=0.4, max_results=50)) #[('doc_7', 0.7615688443183899, None), ('doc_3', 0.5443614721298218, None)]
# pre-tokenized (space-separated) Chinese support-chat sentences
texts = ['如果 也 没有 的话 。 这个 确实 没有 办法 了 。 我 个人 建议您 重装 一遍 这个 软件 看看 是否 还是 一样 卸载 程序 里 也 没有 呢',
         '我能 直接 删掉 这些 文件 吗 ?',
         '不 建议 呢 。 因为 不 确定 这些 文件 中 是否 有 其他软件 的 文件 呢',
         '好 的 , 使用 看看 会断 么',
         '它 只是 有时 自动 掉 , 以后 看看 怎么样',
         '这个 是 您 无线 驱动 : http : / / driverdl . lenovo . com . cn / lenovo / driverfilesuploadfloder / 32228 / wlan _ win8 . 1 . exe',
         '要是 问题 还是 出现 您 可以 安装 这个 试试',
         '10 几个 版本 都 试过 了 么']
corpus = [{'id': 'doc_%i' % num, 'tokens': text.split()} for num, text in enumerate(texts)]
server.train(corpus, method='lsi')
# let's just index the corpus
server.index(corpus)
server.delete(['doc_5'])  # try to delete one doc
server.commit()

# the first four sentences again, used as evaluation queries
texts = ['如果 也 没有 的话 。 这个 确实 没有 办法 了 。 我 个人 建议您 重装 一遍 这个 软件 看看 是否 还是 一样 卸载 程序 里 也 没有 呢',
         '我能 直接 删掉 这些 文件 吗 ?',
         '不 建议 呢 。 因为 不 确定 这些 文件 中 是否 有 其他软件 的 文件 呢',
         '好 的 , 使用 看看 会断 么']
corpus = [{'id': 'eval_%i' % num, 'tokens': text.split()} for num, text in enumerate(texts)]
# find_similar for each document; these should return doc_0, doc_1, doc_2
for s in corpus:
    print server.find_similar(s)
from mongokit import Document
from flask import Flask
from flask.ext.pymongo import PyMongo
import datetime
from simserver import SessionServer
from gensim import utils
import itertools
from pymongo import MongoClient

sim_server = SessionServer('./tmp/idea_match_server')

client = MongoClient('localhost', 3001)
db = client.meteor
cursor = db.ideas.find({})
corpus = [{'id': idea['_id'], 'tokens': utils.simple_preprocess(idea['text'])}
          for idea in cursor]
utils.upload_chunked(sim_server, corpus, chunksize=1000)
sim_server.train(corpus, method='lsi')
sim_server.index(corpus)

app = Flask(__name__)
app.config['MONGO_HOST'] = 'localhost'
app.config['MONGO_PORT'] = 3001
app.config['MONGO_DBNAME'] = 'meteor'
mongo = PyMongo(app)

class Idea(Document):
    structure = {
        'text': unicode,
        'parent_id': unicode,
        'date_created': datetime.datetime,
        'status': int,  # open, pending, rejected, filled
        'suggested_relations': [],
    }
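# The snippet sets up Flask but defines no routes; a hypothetical endpoint
# sketch that matches new idea text against the trained index (the route
# name, request parsing and response shape are all assumptions):
from flask import request, jsonify

@app.route('/similar', methods=['POST'])
def similar_ideas():
    text = request.json['text']
    doc = {'tokens': utils.simple_preprocess(text)}
    hits = sim_server.find_similar(doc, min_score=0.5, max_results=10)
    return jsonify(matches=[{'id': id, 'score': score} for id, score, _ in hits])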