# NOTE(review): whitespace-collapsed Python 2 script fragment. The original line
# breaks and indentation are lost, and the last WordDoc.run(...) call is cut off
# in the middle of its argument list, so this text is not runnable as it stands.
# Visible steps, in order:
#   1. Print the Lucene VERSION, open the index at STORE_DIR and build a
#      searcher and a StandardAnalyzer.
#   2. Load ICTCLAS50.dll through ctypes, call ICTCLAS_Init and import a user
#      dictionary file (the return values IfInit / wordcount are not checked).
#   3. Run SearchFiles_pylucene.run for `input`, then WordDoc.run(..., 20, ...)
#      to fill topWordLev1.
#   4. For every entry of topWordLev1, re-encode entry[0] from UTF-8 to GBK and
#      repeat the search / WordDoc.run with that word.
# NOTE(review): ICTCLAS_ImportUserDictFile receives the Python string
# "CODE_TYPE_UTF8" as its second argument -- confirm the DLL does not expect an
# integer enum value there.
print 'lucene', VERSION directory = SimpleFSDirectory(File(STORE_DIR)) searcher = IndexSearcher(directory, True) analyzer = StandardAnalyzer(Version.LUCENE_CURRENT) print "Lucene Search Init Done..." #------------- Divde Word Init ----------------- divide = cdll.LoadLibrary("D:\\ICTCLAS\\ICTCLAS50.dll") IfInit = divide.ICTCLAS_Init(c_char_p("D:\\ICTCLAS")) wordcount = divide.ICTCLAS_ImportUserDictFile('D:\\ICTCLAS\\userdict.txt', "CODE_TYPE_UTF8") print "Divde Word Init Done..." print #------------- Net Word Generate ----------------- SearchFiles_pylucene.run(searcher, analyzer, input, filepath) topWordLev1 = [] # top 20 word topWordLev2 = [] wordNet = collections.OrderedDict() topWordLev1 = WordDoc.run(divide, filepath, startDate, endDate, input, idfpath, totalfile, 20, stockcodeflag) print "***************** Lev1 Word Net " + input + " Generated *********************" # os.system("pause") print for wordLev1 in topWordLev1: SearchFiles_pylucene.run(searcher, analyzer, wordLev1[0].decode('utf8').encode('gbk'), filepath) topWordLev2 = WordDoc.run(divide, filepath, startDate, endDate, wordLev1[0].decode('utf8').encode('gbk'),
# NOTE(review): whitespace-collapsed Python 2 script fragment. The original line
# breaks and indentation are lost. The `except Exception, e:` below has no
# visible matching `try:` (the only one shown is commented out), so the fragment
# starts inside a block whose header is outside this text.
# Visible steps, in order:
#   1. Build urlnews / title (GBK-encoded) / newstime (today as YYYYMMDD) from
#      newsHref[count] and newsTitle[count]. The INSERT into RollNews is
#      commented out, but the "insert ..." message is still printed and
#      conn.commit() is still called.
#   2. On any Exception, print it; then print the GBK-encoded title.
#   3. Starting at `if input != '':` (how far that branch extends cannot be
#      recovered from the collapsed text): run the Lucene search for `input` and
#      fill topWordLev1 via WordDoc_stock.run(..., 30, ...).
#   4. Read D:\ICTCLAS\cname_20120718.txt line by line into stockName, mapping
#      line[7:].strip() -> line[:6], then close the file.
# NOTE(review): the inline "# top 20 word" comment sits next to a call that
# passes 30 -- if that argument is the top-N count, the comment is stale.
#try: urlnews = str(newsHref[count]) title = str(newsTitle[count].encode('gbk')) newstime = datetime.datetime.today().strftime('%Y%m%d') #cur.execute("insert into RollNews values('%s','%s','%s')"%(urlnews, title, newstime)) print "insert " + urlnews + " into database..." conn.commit() except Exception, e: print e #continue print newsTitle[count].encode('gbk') #------------- Net Word Generate ----------------- if input != '': SearchFiles_pylucene.run(searcher, analyzer, input, filepath) topWordLev1 = [] # top 20 word topWordLev1 = WordDoc_stock.run(divide, filepath, startDate, endDate, input, idfpath, totalfile, 30, stockcodeflag) #topWordLev1 = WordDoc.run(divide, filepath, startDate, endDate, input, idfpath, totalfile, 30, stockcodeflag) print "***************** Lev1 Word Net " + input + " Generated *********************" # os.system("pause") # print filecname = open("D:\\ICTCLAS\\cname_20120718.txt", 'r') stockName = {} for line in filecname: stockName[line[7:].strip()] = line[:6] filecname.close()
initVM(maxheap='512m') print 'lucene', VERSION directory = SimpleFSDirectory(File(STORE_DIR)) searcher = IndexSearcher(directory, True) analyzer = StandardAnalyzer(Version.LUCENE_CURRENT) print "Lucene Search Init Done..." #------------- Divde Word Init ----------------- divide = cdll.LoadLibrary("D:\\ICTCLAS\\ICTCLAS50.dll") IfInit = divide.ICTCLAS_Init(c_char_p("D:\\ICTCLAS")) wordcount = divide.ICTCLAS_ImportUserDictFile('D:\\ICTCLAS\\userdict.txt',"CODE_TYPE_UTF8") print "Divde Word Init Done..." print #------------- Net Word Generate ----------------- SearchFiles_pylucene.run(searcher, analyzer, input, filepath) topWordLev1 = [] # top 20 word topWordLev2 = [] wordNet = collections.OrderedDict() topWordLev1 = WordDoc.run(divide, filepath, startDate, endDate, input, idfpath, totalfile, 20, stockcodeflag) print "***************** Lev1 Word Net " + input + " Generated *********************" # os.system("pause") print for wordLev1 in topWordLev1: SearchFiles_pylucene.run(searcher, analyzer, wordLev1[0].decode('utf8').encode('gbk'), filepath) topWordLev2 = WordDoc.run(divide, filepath, startDate, endDate, wordLev1[0].decode('utf8').encode('gbk'), idfpath, totalfile, 5, stockcodeflag) wordNet[wordLev1] = topWordLev2 print "------------- Lev2 Word Net " + wordLev1[0].decode('utf8').encode('gbk') + "[" + str(wordLev1[1]) + "] Generated -------------" print # os.system("pause")
# NOTE(review): whitespace-collapsed Python 2 script fragment. The original line
# breaks and indentation are lost. The `except Exception,e:` below has no visible
# matching `try:` (the only one shown is commented out), and the body of the
# final `if len(topWordLev1) != 0:` may continue beyond the text shown here.
# Visible steps, in order:
#   1. Build urlnews / title (GBK-encoded) / newstime (today as YYYYMMDD) from
#      newsHref[count] and newsTitle[count]. The INSERT into RollNews is
#      commented out, but the "insert ..." message is still printed and
#      conn.commit() is still called.
#   2. On any Exception, print it; then print the GBK-encoded title.
#   3. Starting at `if input != '':` (how far that branch extends cannot be
#      recovered from the collapsed text): run the Lucene search for `input` and
#      fill topWordLev1 via WordDoc_stock.run(..., 30, ...).
#   4. Read D:\ICTCLAS\cname_20120718.txt line by line into stockName, mapping
#      line[7:].strip() -> line[:6], then close the file.
#   5. When topWordLev1 is non-empty, build the output path under
#      D:\TotalCode\LuceneCode\WordNet\WordNet_stock\ from sourcedata, input and
#      the start/end dates formatted as YYYYMMDD.
# NOTE(review): the inline "# top 20 word" comment sits next to a call that
# passes 30 -- if that argument is the top-N count, the comment is stale.
#try: urlnews = str(newsHref[count]) title = str(newsTitle[count].encode('gbk')) newstime = datetime.datetime.today().strftime('%Y%m%d') #cur.execute("insert into RollNews values('%s','%s','%s')"%(urlnews, title, newstime)) print "insert " + urlnews + " into database..." conn.commit() except Exception,e: print e #continue print newsTitle[count].encode('gbk') #------------- Net Word Generate ----------------- if input != '': SearchFiles_pylucene.run(searcher, analyzer, input, filepath) topWordLev1 = [] # top 20 word topWordLev1 = WordDoc_stock.run(divide, filepath, startDate, endDate, input, idfpath, totalfile, 30, stockcodeflag) #topWordLev1 = WordDoc.run(divide, filepath, startDate, endDate, input, idfpath, totalfile, 30, stockcodeflag) print "***************** Lev1 Word Net " + input + " Generated *********************" # os.system("pause") # print filecname = open("D:\\ICTCLAS\\cname_20120718.txt", 'r') stockName = {} for line in filecname: stockName[line[7:].strip()] = line[:6] filecname.close() if len(topWordLev1) != 0: outputfile = "D:\\TotalCode\\LuceneCode\\WordNet\\WordNet_stock\\"+sourcedata+"_"+input+"_"+startDate.strftime('%Y%m%d')+"-"+endDate.strftime('%Y%m%d')+".txt"