Exemplo n.º 1
0
    print 'lucene', VERSION
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    print "Lucene Search Init Done..."

    #------------- Divide Word Init -----------------
    divide = cdll.LoadLibrary("D:\\ICTCLAS\\ICTCLAS50.dll")
    IfInit = divide.ICTCLAS_Init(c_char_p("D:\\ICTCLAS"))
    wordcount = divide.ICTCLAS_ImportUserDictFile('D:\\ICTCLAS\\userdict.txt',
                                                  "CODE_TYPE_UTF8")
    print "Divde Word Init Done..."
    print

    #------------- Net Word Generate -----------------
    SearchFiles_pylucene.run(searcher, analyzer, input, filepath)
    topWordLev1 = []  # top 20 word
    topWordLev2 = []
    wordNet = collections.OrderedDict()
    topWordLev1 = WordDoc.run(divide, filepath, startDate, endDate, input,
                              idfpath, totalfile, 20, stockcodeflag)
    print "***************** Lev1 Word Net " + input + " Generated *********************"
    #  os.system("pause")
    print

    for wordLev1 in topWordLev1:
        SearchFiles_pylucene.run(searcher, analyzer,
                                 wordLev1[0].decode('utf8').encode('gbk'),
                                 filepath)
        topWordLev2 = WordDoc.run(divide, filepath, startDate, endDate,
                                  wordLev1[0].decode('utf8').encode('gbk'),
Exemplo n.º 2
0
                #try:
                urlnews = str(newsHref[count])
                title = str(newsTitle[count].encode('gbk'))
                newstime = datetime.datetime.today().strftime('%Y%m%d')
                #cur.execute("insert into RollNews values('%s','%s','%s')"%(urlnews, title, newstime))
                print "insert " + urlnews + " into database..."
                conn.commit()
            except Exception, e:
                print e
#continue

            print newsTitle[count].encode('gbk')

            #------------- Net Word Generate -----------------
            if input != '':
                SearchFiles_pylucene.run(searcher, analyzer, input, filepath)
                topWordLev1 = []  # top 20 word
                topWordLev1 = WordDoc_stock.run(divide, filepath, startDate,
                                                endDate, input, idfpath,
                                                totalfile, 30, stockcodeflag)
                #topWordLev1 = WordDoc.run(divide, filepath, startDate, endDate, input, idfpath, totalfile, 30, stockcodeflag)
                print "***************** Lev1 Word Net " + input + " Generated *********************"
                #        os.system("pause")
                #        print

                filecname = open("D:\\ICTCLAS\\cname_20120718.txt", 'r')
                stockName = {}
                for line in filecname:
                    stockName[line[7:].strip()] = line[:6]
                filecname.close()
Exemplo n.º 3
0
  # Purpose of this fragment: set up a Lucene searcher and the ICTCLAS word
  # divider, then build a two-level "word net" for the query string `input`.
  # (The enclosing function header is outside this excerpt.)

  # Initialise the VM for PyLucene with a 512m max heap, then open the index
  # stored on disk at STORE_DIR.
  initVM(maxheap='512m')
  print 'lucene', VERSION
  directory = SimpleFSDirectory(File(STORE_DIR))
  # NOTE(review): the second argument is presumably the read-only flag --
  # confirm against the PyLucene version in use.
  searcher = IndexSearcher(directory, True)
  analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
  print "Lucene Search Init Done..."

#------------- Divide Word Init -----------------
  # Load the ICTCLAS DLL through ctypes and initialise it with its install
  # directory, then import the user dictionary file.
  # NOTE(review): the return values (IfInit, wordcount) are not checked
  # anywhere in this excerpt, so a failed init would go unnoticed here.
  divide = cdll.LoadLibrary("D:\\ICTCLAS\\ICTCLAS50.dll")
  IfInit = divide.ICTCLAS_Init(c_char_p("D:\\ICTCLAS"))
  wordcount = divide.ICTCLAS_ImportUserDictFile('D:\\ICTCLAS\\userdict.txt',"CODE_TYPE_UTF8")
  print "Divde Word Init Done..."
  print

#------------- Net Word Generate -----------------
  # Level 1: run the search for `input`, then ask WordDoc for its top words.
  # NOTE(review): presumably SearchFiles_pylucene.run writes its results under
  # `filepath` and WordDoc.run reads them back -- verify in those modules.
  SearchFiles_pylucene.run(searcher, analyzer, input, filepath) 
  # The empty list assigned to topWordLev1 is immediately replaced by the
  # WordDoc.run result below; topWordLev2 is reassigned on every loop pass.
  topWordLev1 = [] # top 20 word
  topWordLev2 = []
  # Maps each level-1 entry to the list of level-2 entries found for it,
  # in insertion order.
  wordNet = collections.OrderedDict()
  topWordLev1 = WordDoc.run(divide, filepath, startDate, endDate, input, idfpath, totalfile, 20, stockcodeflag)
  print "***************** Lev1 Word Net " + input + " Generated *********************"
#  os.system("pause")
  print

  # Level 2: repeat search + WordDoc for every level-1 word, this time passing
  # 5 instead of 20. Each entry is indexed as [0] (the word, decoded from
  # UTF-8 and re-encoded as GBK before use) and [1] (printed in brackets;
  # presumably a weight or count -- TODO confirm).
  for wordLev1 in topWordLev1:
    SearchFiles_pylucene.run(searcher, analyzer, wordLev1[0].decode('utf8').encode('gbk'), filepath)
    topWordLev2 = WordDoc.run(divide, filepath, startDate, endDate, wordLev1[0].decode('utf8').encode('gbk'), idfpath, totalfile, 5, stockcodeflag)
    # The whole level-1 entry (not just the word) is used as the dict key, so
    # it must be hashable.
    wordNet[wordLev1] = topWordLev2
    print "------------- Lev2 Word Net " + wordLev1[0].decode('utf8').encode('gbk') + "[" + str(wordLev1[1]) + "] Generated -------------"
    print
#  os.system("pause")
Exemplo n.º 4
0
      #try:
        urlnews = str(newsHref[count])
        title = str(newsTitle[count].encode('gbk'))
        newstime = datetime.datetime.today().strftime('%Y%m%d')
        #cur.execute("insert into RollNews values('%s','%s','%s')"%(urlnews, title, newstime))
        print "insert " + urlnews + " into database..."
        conn.commit()
      except Exception,e:
        print e
	#continue

      print newsTitle[count].encode('gbk')

#------------- Net Word Generate -----------------
      if input != '':
        SearchFiles_pylucene.run(searcher, analyzer, input, filepath) 
        topWordLev1 = [] # top 20 word
        topWordLev1 = WordDoc_stock.run(divide, filepath, startDate, endDate, input, idfpath, totalfile, 30, stockcodeflag)
        #topWordLev1 = WordDoc.run(divide, filepath, startDate, endDate, input, idfpath, totalfile, 30, stockcodeflag)
        print "***************** Lev1 Word Net " + input + " Generated *********************"
#        os.system("pause")
#        print

        filecname = open("D:\\ICTCLAS\\cname_20120718.txt", 'r')
        stockName = {}
        for line in filecname:
          stockName[line[7:].strip()] = line[:6]
        filecname.close()

        if len(topWordLev1) != 0:
	  outputfile = "D:\\TotalCode\\LuceneCode\\WordNet\\WordNet_stock\\"+sourcedata+"_"+input+"_"+startDate.strftime('%Y%m%d')+"-"+endDate.strftime('%Y%m%d')+".txt"