def findsimilarity(note):
    """Print and log the books that share at least one tag with a note.

    The file at ``note`` is read, up to 30 keywords are extracted with
    ``jieba.analyse.extract_tags`` and passed through
    ``stopwordsfilter.stopwordsfilter``.  Every entry of the module-level
    ``bookbase`` (refreshed by ``loadbookbase()``) is then compared with
    those keywords; an entry matches when any of its elements equals a
    keyword.  Results are printed and appended to the module-level
    ``loglines`` list.

    Args:
        note: Path of the note file to analyse.
    """
    # Read inside a context manager so the handle is always closed.
    with codecs.open(note, "r") as h:
        text = h.read()

    tags = jieba.analyse.extract_tags(text, topK=30)
    loadbookbase()
    tags = stopwordsfilter.stopwordsfilter(tags)

    matchbooks = []
    for bookline in bookbase:
        # Keep the note's tag order; each tag is reported at most once per
        # entry.  Element-wise equality is used (not ``in``) so the test is
        # the same whatever container type ``bookline`` is.
        shared = [
            tag for tag in tags
            if any(tag == mtag for mtag in bookline)
        ]
        if shared:
            # The first element of the entry is used as its label; the
            # matched tags follow, each terminated by a single space.
            matchbooks.append(bookline[0] + " " + " ".join(shared) + " ")

    # Single-argument print(...) behaves identically as a Python 2 print
    # statement and as the Python 3 print function.
    print("Original Notes")
    print(note)
    loglines.append("Original Notes: " + note + "\n")

    print("Related Books:")
    loglines.append("Related Books:" + "\n")
    for book in matchbooks:
        loglines.append(book + "\n")
        print(book)

    loglines.append("-------------------")
def findsimilarity(note):
    """Print and log the books that share at least one tag with a note.

    The file at ``note`` is read, up to 30 keywords are extracted with
    ``jieba.analyse.extract_tags`` and passed through
    ``stopwordsfilter.stopwordsfilter``.  Every entry of the module-level
    ``bookbase`` (refreshed by ``loadbookbase()``) is then compared with
    those keywords; an entry matches when any of its elements equals a
    keyword.  Results are printed and appended to the module-level
    ``loglines`` list.

    Args:
        note: Path of the note file to analyse.
    """
    # Read inside a context manager so the handle is always closed.
    with codecs.open(note, 'r') as h:
        text = h.read()

    tags = jieba.analyse.extract_tags(text, topK=30)
    loadbookbase()
    tags = stopwordsfilter.stopwordsfilter(tags)

    matchbooks = []
    for bookline in bookbase:
        # Keep the note's tag order; each tag is reported at most once per
        # entry.  Element-wise equality is used (not ``in``) so the test is
        # the same whatever container type ``bookline`` is.
        shared = [
            tag for tag in tags
            if any(tag == mtag for mtag in bookline)
        ]
        if shared:
            # The first element of the entry is used as its label; the
            # matched tags follow, each terminated by a single space.
            matchbooks.append(bookline[0] + " " + " ".join(shared) + " ")

    # Single-argument print(...) behaves identically as a Python 2 print
    # statement and as the Python 3 print function.
    print('Original Notes')
    print(note)
    loglines.append('Original Notes: ' + note + '\n')

    print('Related Books:')
    loglines.append('Related Books:' + '\n')
    for book in matchbooks:
        loglines.append(book + '\n')
        print(book)

    loglines.append('-------------------')
def findsimilarity(note):
    """Print and log the stored notes that share a tag with a given note.

    The file at ``note`` is read, up to 30 keywords are extracted with
    ``jieba.analyse.extract_tags`` and passed through
    ``stopwordsfilter.stopwordsfilter``.  Every entry of the module-level
    ``tagbase`` (refreshed by ``loadtagbase()``) is then compared with
    those keywords; an entry matches when any of its elements equals a
    keyword.  Results are printed and appended to the module-level
    ``loglines`` list.

    Args:
        note: Path of the note file to analyse.
    """
    # Read inside a context manager so the handle is always closed.
    with codecs.open(note, 'r') as h:
        text = h.read()

    tags = jieba.analyse.extract_tags(text, topK=30)
    loadtagbase()
    tags = stopwordsfilter.stopwordsfilter(tags)

    matchnotes = []
    for tagline in tagbase:
        # Keep the note's tag order; each tag is reported at most once per
        # entry.  Element-wise equality is used (not ``in``) so the test is
        # the same whatever container type ``tagline`` is.
        shared = [
            tag for tag in tags
            if any(tag == mtag for mtag in tagline)
        ]
        if shared:
            # The first element of the entry is used as its label; the
            # matched tags follow, each terminated by a single space.
            matchnotes.append(tagline[0] + " " + " ".join(shared) + " ")

    # Single-argument print(...) behaves identically as a Python 2 print
    # statement and as the Python 3 print function.
    print('Original Notes')
    print(note)
    loglines.append('Original Notes: ' + note + '\n')

    # NOTE(review): the console heading says "Similar" while the log says
    # "Related"; both strings are kept as they were - confirm which is meant.
    print('Similar Notes:')
    loglines.append('Related Notes:' + '\n')
    for nt in matchnotes:
        loglines.append(nt + '\n')
        print(nt)

    loglines.append('-------------------')
def noteproc():
    """Extract keyword tags from every note and write them to a tag file.

    Each regular, non-hidden file directly inside the module-level
    ``noteroot`` directory is read, stripped of some markdown syntax,
    and fed to ``jieba.analyse.extract_tags`` (up to 30 keywords, without
    weights).  The keywords are filtered by
    ``stopwordsfilter.stopwordsfilter``.  One line per note, of the form
    ``<filename> <tag> <tag> ...``, is written to ``./seg/tags.txt``
    (UTF-8), and progress is printed to standard output.
    """
    taglist = []
    for f in os.listdir(noteroot):
        # Skip hidden entries and sub-directories.
        if f.startswith('.'):
            continue
        path = os.path.join(noteroot, f)
        if os.path.isdir(path):
            continue

        # Load the note; the context manager closes the handle each time.
        with codecs.open(path, 'r') as h:
            text = h.read()

        # TODO: also strip timestamps (YYYY-MM-DD and YYYY-MM-DD HH:MM:SS)
        # from the note text; this should become a helper function.

        # Clean markdown syntax: image tags, parenthesised http links and
        # heading markers.
        # NOTE(review): the ``.*`` parts are greedy, so several images or
        # links on one line are removed together with the text between
        # them - confirm this is acceptable.
        text = re.sub(r'!\[.*\]\(.*\)', '', text)
        text = re.sub(r'\(http.*\)', '', text)
        text = re.sub(r'#+', '', text)

        # Get up to 30 tags, without weights.
        tags = jieba.analyse.extract_tags(text, topK=30, withWeight=False)
        tags = stopwordsfilter.stopwordsfilter(tags)

        # ``output`` is the line stored in the tag file; ``console`` is the
        # progress line, which shows the name, then each tag preceded by
        # two spaces, then one trailing space - the same text the former
        # sequence of comma-terminated print statements produced.
        output = f
        console = f
        for t in tags:
            output = output + u' ' + t
            console = console + u'  ' + t
        # Single-argument print(...) is valid as both a Python 2 print
        # statement and a Python 3 print call.
        print(console + ' ')
        taglist.append(output + '\n')

    # Write everything out and make sure the file is flushed and closed.
    with codecs.open('./seg/tags.txt', 'w', 'utf-8') as fh:
        for line in taglist:
            fh.write(line)
    print('done')
def noteproc():
    """Extract keyword tags from every note and write them to a tag file.

    Each regular, non-hidden file directly inside the module-level
    ``noteroot`` directory is read, stripped of some markdown syntax,
    and fed to ``jieba.analyse.extract_tags`` (up to 30 keywords, without
    weights).  The keywords are filtered by
    ``stopwordsfilter.stopwordsfilter``.  One line per note, of the form
    ``<filename> <tag> <tag> ...``, is written to ``./seg/tags.txt``
    (UTF-8), and progress is printed to standard output.
    """
    taglist = []
    for f in os.listdir(noteroot):
        # Skip hidden entries and sub-directories.
        if f.startswith('.'):
            continue
        path = os.path.join(noteroot, f)
        if os.path.isdir(path):
            continue

        # Load the note; the context manager closes the handle each time.
        with codecs.open(path, 'r') as h:
            text = h.read()

        # TODO: also strip timestamps (YYYY-MM-DD and YYYY-MM-DD HH:MM:SS)
        # from the note text; this should become a helper function.

        # Clean markdown syntax: image tags, parenthesised http links and
        # heading markers.
        # NOTE(review): the ``.*`` parts are greedy, so several images or
        # links on one line are removed together with the text between
        # them - confirm this is acceptable.
        text = re.sub(r'!\[.*\]\(.*\)', '', text)
        text = re.sub(r'\(http.*\)', '', text)
        text = re.sub(r'#+', '', text)

        # Get up to 30 tags, without weights.
        tags = jieba.analyse.extract_tags(text, topK=30, withWeight=False)
        tags = stopwordsfilter.stopwordsfilter(tags)

        # ``output`` is the line stored in the tag file; ``console`` is the
        # progress line, which shows the name, then each tag preceded by
        # two spaces, then one trailing space - the same text the former
        # sequence of comma-terminated print statements produced.
        output = f
        console = f
        for t in tags:
            output = output + u' ' + t
            console = console + u'  ' + t
        # Single-argument print(...) is valid as both a Python 2 print
        # statement and a Python 3 print call.
        print(console + ' ')
        taglist.append(output + '\n')

    # Write everything out and make sure the file is flushed and closed.
    with codecs.open('./seg/tags.txt', 'w', 'utf-8') as fh:
        for line in taglist:
            fh.write(line)
    print('done')