Example #1
    def addDocument(self, writer, new_doc, metadata, fields_to_process, bow_info):
        """
            Add a document to the index. Does this using direct Lucene access.

            :param new_doc: dict of fields with values
            :type new_doc:dict
            :param metadata: ditto
            :type metadata:dict
            :param fields_to_process: only add these fields from the doc dict
            :type fields_to_process:list
        """
        doc = Document()
        total_numTerms=bow_info["total_numterms"]
        # each BOW now comes with its field
        for field in fields_to_process:
            field_object=Field(field, new_doc[field], Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.YES)
##            boost=math.sqrt(numTerms[field]) / float(math.sqrt(total_numTerms)) if total_numTerms > 0 else float(0)
            boost=1 / float(math.sqrt(total_numTerms)) if total_numTerms > 0 else float(0)
            field_object.setBoost(float(boost))
            doc.add(field_object)

        json_metadata=json.dumps(metadata)
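        # note: 'guid' is not defined in this method; it presumably comes from an
        # enclosing scope or from the metadata dict (e.g. metadata["guid"])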
        doc.add(Field("guid", guid, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("bow_info", json.dumps(bow_info), Field.Store.YES, Field.Index.NO))
        doc.add(Field("metadata", json_metadata, Field.Store.YES, Field.Index.NO))
        doc.add(Field("year_from", metadata["year"], Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
Example #2
    def add_article(self, article):
        # constructing a document
        doc = Document()

        title = Field('title', article.title, Field.Store.YES, Field.Index.ANALYZED)
        title.setBoost(10.0)
        doc.add(title)

        description = Field('description', article.description, Field.Store.YES, Field.Index.ANALYZED)
        description.setBoost(5.0)
        doc.add(description)

        doc.add(Field('keywords', article.keywords, Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field('content', article.content, Field.Store.YES, Field.Index.ANALYZED))
        if article.date:
            doc.add(Field('date', article.date, Field.Store.YES, Field.Index.NOT_ANALYZED))
        if article.last_modified:
            doc.add(Field('last_modified', article.last_modified, Field.Store.YES, Field.Index.NOT_ANALYZED))
        if article.images:
            doc.add(Field('image_url', article.images[0][0], Field.Store.YES, Field.Index.NOT_ANALYZED))
            doc.add(Field('image_text', article.images[0][1], Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field('url', article.url, Field.Store.YES, Field.Index.NOT_ANALYZED))

        # creates document or updates if already exists
        self.writer.updateDocument(Term("url", article.url), doc)
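
The self.writer used above is assumed to be an IndexWriter created during the class's setup. A minimal sketch of such a setup, mirroring the SimpleFSDirectory/IndexWriterConfig pattern that appears in the later examples (index_dir and the choice of StandardAnalyzer are illustrative assumptions):

        store = SimpleFSDirectory(File(index_dir))
        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
        self.writer = IndexWriter(store, config)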
Example #3
    def addDocument(self, writer, new_doc, metadata, fields_to_process,
                    bow_info):
        """
            Add a document to the index. Does this using direct Lucene access.

            :param new_doc: dict of fields with values
            :type new_doc:dict
            :param metadata: ditto
            :type metadata:dict
            :param fields_to_process: only add these fields from the doc dict
            :type fields_to_process:list
        """
        doc = Document()
        total_numTerms = bow_info["total_numterms"]
        # each BOW now comes with its field
        for field in fields_to_process:
            field_object = Field(field, new_doc[field], Field.Store.NO,
                                 Field.Index.ANALYZED, Field.TermVector.YES)
            ##            boost=math.sqrt(numTerms[field]) / float(math.sqrt(total_numTerms)) if total_numTerms > 0 else float(0)
            boost = 1 / float(
                math.sqrt(total_numTerms)) if total_numTerms > 0 else float(0)
            field_object.setBoost(float(boost))
            doc.add(field_object)

        json_metadata = json.dumps(metadata)
        doc.add(Field("guid", guid, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(
            Field("bow_info", json.dumps(bow_info), Field.Store.YES,
                  Field.Index.NO))
        doc.add(
            Field("metadata", json_metadata, Field.Store.YES, Field.Index.NO))
        doc.add(
            Field("year_from", metadata["year"], Field.Store.YES,
                  Field.Index.ANALYZED))
        writer.addDocument(doc)
Example #4
    def testDocBoost(self):

        writer = self.getWriter(
            analyzer=SimpleAnalyzer(Version.LUCENE_CURRENT))

        f1 = Field("field", "word", TextField.TYPE_STORED)
        f2 = Field("field", "word", TextField.TYPE_STORED)
        f2.setBoost(2.0)

        d1 = Document()
        d2 = Document()

        d1.add(f1)  # boost = 1
        d2.add(f2)  # boost = 2

        writer.addDocument(d1)
        writer.addDocument(d2)
        writer.close()

        scores = [0.0] * 2

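        # PythonCollector subclass that records each hit's score, offset by the segment's docBase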
        class collector(PythonCollector):
            def __init__(_self, scores):
                super(collector, _self).__init__()
                _self.scores = scores
                _self.base = 0

            def collect(_self, doc, score):
                _self.scores[doc + _self.base] = score

            def setNextReader(_self, context):
                _self.base = context.docBase

            def acceptsDocsOutOfOrder(_self):
                return True

        self.getSearcher().search(TermQuery(Term("field", "word")),
                                  collector(scores))

        lastScore = 0.0
        for score in scores:
            self.assert_(score > lastScore)
            lastScore = score
Example #6
def create_index(index) :
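	# 'stopwords' and 'get_data_from_text' are assumed to be defined elsewhere in this module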
	indexDir = SimpleFSDirectory(File(index))
	stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
	for s in stopwords :
		stops.add(s)
	analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
	writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
	writer = IndexWriter(indexDir, writerConfig)

	print "%d docs in index" % writer.numDocs()
	print "Reading Documents"

	f = open('f:/nlp/data/questions/combine.txt')
	for line in f :
		line = get_data_from_text(line.decode('utf-8'))
		doc = Document()
		field = Field("text", line, Field.Store.YES, Field.Index.ANALYZED)
		field.setBoost(2.0)
		doc.add(field)
		writer.addDocument(doc)
	
	print "Indexed (%d docs in index)" % (writer.numDocs())
	print "Closing index of %d docs..." % writer.numDocs()
	writer.close()
Example #7
    def indexTable(self, writer):

        #connection 
        con = None

        #define the index of all the fields
        #---------step 2:connect to mysql----------
        con = mdb.connect('localhost','root','testgce','douban_movie_v3')

        #t_num = FieldType.NumericType it is wrong!!
        t_num = FieldType()
        t_num.setStored(False)

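        # field types used below (roles inferred from how they are applied):
        #   t1: indexed, stored, untokenized   (exact values such as raw_casts, raw_directors)
        #   t2: indexed, not stored, tokenized with positions   (aka, summary)
        #   t3: indexed, stored, tokenized   (most analyzed fields: countries, casts, title, ...)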
        t1 = FieldType()
        t1.setIndexed(True)
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
        
        t2 = FieldType()
        t2.setIndexed(True)
        t2.setStored(False)
        t2.setTokenized(True)
        t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        t3 = FieldType()
        t3.setIndexed(True)
        t3.setStored(True)
        t3.setTokenized(True)
        t3.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

        maxDict = utils.maxDict
        # range for the per-document boost value
        base = DOC_BOOST_RANGE[0]
        upper = DOC_BOOST_RANGE[1]

        with con:
            # Careful with codecs
            con.set_character_set('utf8')

            cur = con.cursor()
            # Again the codecs
            cur.execute('SET NAMES utf8;')
            cur.execute('SET CHARACTER SET utf8;')
            cur.execute('SET character_set_connection=utf8;')
            
            #------step 3: choose the right table------
            cur.execute("SELECT * FROM movie_items")

            numrows = int(cur.rowcount)
            print 'numrows:',numrows
            for i in range(numrows):
                print
                row = cur.fetchone()

                #------step 4:Index your field------
                summary = row[SUMMARY]  
                subject_id = row[SUBJECT_ID]


                print 'id'+subject_id
                year = utils.formatYear(row[YEAR])
                dateStr = ''
                try:
                    date = DateTools.stringToDate(year.replace('-', ' '))
                    dateStr = DateTools.dateToString(date, DateTools.Resolution.DAY)
                except:
                    # log years that cannot be parsed; dateStr stays empty for them
                    wtfFile = open('wtf.txt', 'a')
                    wtfFile.write(year + '\n')

                doc = Document()

                #boosting
                boostProb = utils.calcBoostProb(row,maxDict,dateStr)
                boost = base + boostProb*(upper-base)

                doc.add(FloatField("boost",boost,Field.Store.YES))
                doc.add(StringField("year",dateStr,Field.Store.YES))
                print 'dateStr:'+dateStr
                #A text field is a sequence of terms that has been tokenized while a string field is a single term (although it can also be multivalued.)

                do_count = row[DO_COUNT] if row[DO_COUNT] is not None else 0
                wish_count = row[WISH_COUNT] if row[WISH_COUNT] is not None else 0

                #fields which should not be analyzed
                doc.add(FloatField("rating_average",float(row[RATING_AVERAGE]),Field.Store.YES))
                doc.add(FloatField("rating_stars", float(row[RATING_STARS]), Field.Store.YES))
                doc.add(IntField("reviews_count", int(row[REVIEWS_COUNT]), Field.Store.YES))
                #doc.add(FloatField("year", float(row[YEAR]), Field.Store.YES).setBoost(boost))
                doc.add(IntField("collect_count", int(row[COLLECT_COUNT]), Field.Store.YES))
                doc.add(IntField("do_count", int(do_count), Field.Store.YES))
                doc.add(IntField("wish_count", int(wish_count), Field.Store.YES))
                doc.add(IntField("subject_id", int(row[SUBJECT_ID]), Field.Store.YES))
                doc.add(IntField("comments_count", int(row[COMMENTS_COUNT]), Field.Store.YES))
                doc.add(IntField("ratings_count", int(row[RATINGS_COUNT]), Field.Store.YES))
                doc.add(StringField("image_small", row[IMAGE_SMALL], Field.Store.YES))

                #fields which should be analyzed with WhitespaceAnalyzer
                #attention!!! don't write it as one chained call like:
                #doc.add(Field("genres",    row[GENRES].replace(delim,' '),    t3).setBoost(boost))
                #or you'll get a null pointer error
                f = Field("countries", row[COUNTRIES].replace(delim,' '), t3)
                f.setBoost(boost)
                doc.add(f)

                #process casts
                raw_casts = row[CASTS].replace(delim,' ')
                f = Field("raw_casts", raw_casts , t1)
                f.setBoost(boost)
                doc.add(f)

                # replace the '·' separator in Western person names with a space
                raw_casts = raw_casts.replace('·',' ')
                
                if len(raw_casts.split(' ')) < CASTS_LEN:
                    # the average name length is about 4; pad short cast lists with filler tokens
                    casts = raw_casts + ' ¥¥¥¥'*(CASTS_LEN-len(raw_casts.split(' ')))
                else:
                    casts = raw_casts
                f = Field("casts", casts, t3)
                f.setBoost(boost)
                doc.add(f)

                #process directors
                raw_directors = row[DIRECTORS].replace(delim,' ')
                f = Field("raw_directors",raw_directors, t1)
                f.setBoost(boost)
                doc.add(f)

                # replace the '·' separator in Western person names with a space
                raw_directors = raw_directors.replace('·',' ')

                if len(raw_directors.split(' ')) < DIRECTORS_LEN:
                    # the average name length is about 4; pad short director lists with filler tokens
                    directors = raw_directors + ' ¥¥¥¥'*(DIRECTORS_LEN-len(raw_directors.split(' ')))
                else:
                    directors = raw_directors
                f = Field("directors", directors, t3)
                f.setBoost(boost)
                doc.add(f)

                Field("genres",    row[GENRES].replace(delim,' '),    t3)
                f.setBoost(boost)
                doc.add(f)

                Field("subtype",   row[SUBTYPE].replace(delim,' '),   t3)
                f.setBoost(boost)
                doc.add(f)

                # this would be wrong because IndexableField has no setBoost method:
                # fieldList = doc.getFields()  # is not a python 'list' , but a 'List' which is unindexable                
                # for eachField in fieldList:
                #     eachField.setBoost(boost)


                # store the raw user_tags string; it is needed for reRank:
                doc.add(StringField("raw_user_tags",row[USER_TAGS],Field.Store.YES))
                doc.add(StringField("raw_others_like",row[OTHERS_LIKE],Field.Store.YES))
                

                user_tags_str = ''
                others_like_str = ''
                tags_len = 0
                

                if row[USER_TAGS]!='':
                    user_tags_list = row[USER_TAGS].split(delim) 
                    for tag_pair in user_tags_list:
                        if tag_pair != '':  # the string ends with '¥', so the last element after split is empty
                            #print 'tag_pair'+tag_pair+'hhe'
                            tag_name = tag_pair.split(delim_uo)[0]+' '  # don't forget this space!
                            tag_num = tag_pair.split(delim_uo)[1]
                            tag_num_processed = int(int(tag_num)/TAG_SPAN)+1  # at least 1
                            user_tags_str = user_tags_str +' '+ tag_name * tag_num_processed
                            tags_len = tags_len + tag_num_processed  # running total of tag terms


                if tags_len < TAGS_AVER_LEN:
                    # pad the tags; 3 looks like the average length, hence ' ¥¥¥'
                    user_tags_str = user_tags_str +' ¥¥¥'*(TAGS_AVER_LEN - tags_len)


                if row[OTHERS_LIKE]!='':
                    for like_pair in row[OTHERS_LIKE].split(delim):
                        if like_pair!='':
                            others_like_str = others_like_str +' '+like_pair.split(delim_uo)[1]


                #start process adjs
                if row[ADJS] != None:
                    raw_adjs = row[ADJS][:-1]

                    adjs_str = ''
                    adjs_len = 0
                    if row[ADJS] != '' and row[ADJS] != '\n':
                        # e.g. '重要=4.0,特殊=4.0' (adjective=weight pairs)
                        adjs_str = row[ADJS]
                        adjs_list = adjs_str.split(',')
                        for adj_pair in adjs_list:
                            #print 'adj_pair:'+adj_pair+'hhe'
                            adj_name = adj_pair.split('=')[0]
                            adj_num = adj_pair.split('=')[1]

                            # strip the trailing newline and convert to int
                            if adj_num[-1] == '\n':
                                adj_num = adj_num[0:-1]
                            adj_num = int(float(adj_num))

                            add_adj=''
                            # # synonyms
                            # adj_name_bro = searchDictValue(adjMap,adj_name)
                            # if adj_name_bro == -1: # no result, i.e. no synonym found, add nothing
                            #     add_adj = ''
                            # else:
                            #     add_adj = (adj_name_bro+' ')*adj_num
                            #     raw_adjs = raw_adjs + ',' + adj_name_bro+'='+str(adj_num)
                                
                            adjs_str = adjs_str + ' ' + (adj_name+' ') * adj_num +add_adj
                            adjs_len = adjs_len + adj_num  # running total of adjective terms

                    #print raw_adjs
                    doc.add(StringField("raw_adjs",raw_adjs,Field.Store.YES))

                    if adjs_len < ADJS_AVER_LEN:
                        # pad adjs_str; 2 looks like the average length, hence ' ¥¥'
                        adjs_str = adjs_str +' ¥¥'*(ADJS_AVER_LEN - adjs_len)

                    f = Field("adjs", adjs_str, t3)
                    f.setBoost(boost)
                    doc.add(f)

                f = Field("user_tags", user_tags_str, t3)
                f.setBoost(boost)
                doc.add(f)

                f = Field("others_like", others_like_str, t3)
                f.setBoost(boost)
                doc.add(f)



                #fields which should be analyzed with good analyzer
                f = Field("title", row[TITLE], t3)                
                f.setBoost(boost)
                doc.add(f)

                f = Field("original_title", row[ORIGINAL_TITLE], t3)
                f.setBoost(boost)
                doc.add(f)

                f = Field("summary_segmentation", row[SUMMARY_SEGMENTATION], t3)
                f.setBoost(boost)
                doc.add(f)

                f = Field("aka", row[AKA], t2)
                f.setBoost(boost)
                doc.add(f)

                if len(summary) > 0:
                    print subject_id +'--->'+':\n    '+ row[TITLE]
                    try:
                        summary_unicoded = unicode(summary, 'utf-8') #test the encoding 
                    except Exception,e:
                        print "Decode Failed: ", e
                    f = Field('summary', summary, t2)
                    f.setBoost(boost)
                    doc.add(f)
                else:
                    print "warning:\n" + subject_id +'---> No content!'
                print 'boosting:' + str(boost)

                #for debug
                if boost>upper:
                    print boostProb
                    print maxDict
                    
                    exit(0)

                writer.addDocument(doc)
Example #8
def main(use_elasticsearch=True, calculate_PageRank=False, tele_const=0.2):
    """
    main entry for the indexer module.
    """
    jsons_root_dir = 'JSONs/'

    # list of addresses of all json files
    all_json_dirs = glob.glob(jsons_root_dir + '*.json')

    # first reading all json files
    jsons = []
    for jdir in all_json_dirs:
        with open(jdir, 'r') as f:
            jsn = json.load(f)
            jsons.append(jsn)
    print len(jsons), ' json files imported.'

    # now creating a set of all links and then a list of all links in json files
    print 'creating a list of all links'
    links_set = set()
    for js in jsons:
        links_set.add(js["url"])
        for l in js["outlinks"]:
            links_set.add(l)
    print len(links_set), ' links found'
    links = list(links_set)

    ## if user has selected to index documents using Elasticsearch
    # Note that when using Elasticsearch, page rank is ignored
    if use_elasticsearch:
        from elasticsearch import Elasticsearch
        from elasticsearch_dsl import Search, document, field, connections, Q
        from elasticsearch_dsl.connections import connections

        print 'Using Elasticsearch for indexing, PageRank is ignored'
        es = Elasticsearch()
        es.indices.create(index='book-index', ignore=[400, 404])
        connections.create_connection(hosts=['localhost'], timeout=20)
        connections.add_connection('book', es)
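        # 'Book' is assumed to be an elasticsearch_dsl DocType defined elsewhere,
        # providing the add_authors() and add_userreviews() helpers used below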
        Book.init('book-index')

        ## adding all documents to the index 'book-index'
        for idx, js in enumerate(jsons):
            book = Book(average=js['average'],
                        cover=js['cover'],
                        description=js['description'].encode('utf-8'),
                        ratings=js['ratings'],
                        reviews=js['reviews'],
                        title=js['title'],
                        url=js['url'],
                        outlinks=js['outlinks'])
            book.add_authors(js['authors'])
            book.add_userreviews(js['userreviews'])
            book.id = idx
            book.save()
        print 'Elasticsearch index created'

    ### use pyLucene instead
    else:
        import lucene
        from java.io import File
        from org.apache.lucene.index import IndexWriterConfig, IndexWriter, FieldInfo
        from org.apache.lucene.document import Document, Field, FieldType, IntField, FloatField
        from org.apache.lucene.store import SimpleFSDirectory
        from org.apache.lucene.util import Version
        from org.apache.lucene.analysis.standard import StandardAnalyzer

        print 'Using Lucene for indexing'
        ## if user has selected to calculate the PageRank
        if calculate_PageRank:
            # now creating the unnormalized adjacency matrix
            print 'creating the unnormalized adjacency matrix.'
            adjacency = np.zeros((len(links_set), len(links_set)))
            for js in jsons:
                node_idx = links.index(js["url"])
                for l in js["outlinks"]:
                    out_idx = links.index(l)
                    adjacency[node_idx, out_idx] += 1
            print 'the unnormalized adjacency matrix created.'

            print 'normalizing the adjacency matrix with teleporting constant value of ', tele_const
            norm_mat = Normalize(adjacency, tele_const)
            print 'calculating the PageRank scores'
            pr_scores = PageRankScore(norm_mat)

        ## here goes the pyLucene code, which means I should switch to the damn Ubuntu
        index_folder = '.'
        index_name = 'lucene.index'
        index_path = os.path.join(index_folder, index_name)
        print 'initializing Lucene VM'
        lucene.initVM()
        print 'lucene version ', lucene.VERSION
        version = Version.LUCENE_CURRENT
        index_store = SimpleFSDirectory(File(index_path))
        analyzer = StandardAnalyzer(version)
        config = IndexWriterConfig(version, analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer = IndexWriter(index_store, config)

        # Options
        TokenizeFields = True

        # Title field type
        title_field = 'title'
        tft = FieldType()
        tft.setIndexed(True)
        tft.setStored(True)
        tft.setTokenized(TokenizeFields)
        tft.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)  # only index document and frequency data

        # Authors name field type
        authors_name_field = 'authors_name'
        anft = FieldType()
        anft.setIndexed(True)
        anft.setStored(True)
        anft.setTokenized(TokenizeFields)
        anft.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

        # Authors url field type
        authors_url_field = 'authors_url'
        auft = FieldType()
        auft.setIndexed(False)
        auft.setStored(True)

        # Average rating field type
        average_field = 'average'

        # Cover Image URL field type
        cover_field = 'cover'
        cft = FieldType()
        cft.setIndexed(False)
        cft.setStored(True)

        # Book description field type
        description_field = 'description'
        descft = FieldType()
        descft.setIndexed(True)
        descft.setStored(True)
        descft.setTokenized(TokenizeFields)
        descft.setIndexOptions(
            FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        # Outlinks field type
        outlinks_field = "outlinks"
        outft = FieldType()
        outft.setIndexed(False)
        outft.setStored(True)

        # Ratings count field type
        ratings_field = 'ratings'

        # Reviews count field type
        reviews_field = 'reviews'

        # URL field type
        url_field = 'url'
        uft = FieldType()
        uft.setIndexed(False)
        uft.setStored(True)

        # userreviews.userName field type
        userreviews_userName_field = 'userreviews_userName'
        usunft = FieldType()
        usunft.setIndexed(False)
        usunft.setStored(True)

        #userreviews.userReview field type
        userreviews_userReview_field = 'userreviews_userReview'
        usurft = FieldType()
        usurft.setIndexed(True)
        usurft.setStored(False)
        usurft.setTokenized(TokenizeFields)
        usurft.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

        #userreviews.userReviewDate field type
        userreviews_userReviewDate_field = 'userreviews_userReviewDate'
        usudft = FieldType()
        usudft.setIndexed(False)
        usudft.setStored(True)

        #userreviews.userURL field type
        userreviews_userURL_field = 'userreviews_userURL'
        usuuft = FieldType()
        usuuft.setIndexed(False)
        usuuft.setStored(True)

        docid_field = 'docid'

        for idx, js in enumerate(jsons):
            boostVal = js['average']
            if calculate_PageRank:
                boostVal *= pr_scores[links.index(js['url'])]
            doc = Document()
            for author in js['authors']:
                doc.add(Field(authors_name_field, author['name'], anft))
                doc.add(Field(authors_url_field, author['url'], auft))
            doc.add(
                FloatField(average_field, float(js['average']),
                           Field.Store.YES))
            doc.add(Field(cover_field, js['cover'], cft))
            df = Field(description_field, js['description'], descft)
            df.setBoost(boostVal)
            doc.add(df)
            for u in js['outlinks']:
                doc.add(Field(outlinks_field, u, outft))
            doc.add(IntField(ratings_field, js['ratings'], Field.Store.YES))
            doc.add(IntField(reviews_field, js['reviews'], Field.Store.YES))
            tf = Field(title_field, js['title'], tft)
            tf.setBoost(boostVal)
            doc.add(tf)
            doc.add(Field(url_field, js['url'], uft))

            for rev in js['userreviews']:
                doc.add(
                    Field(userreviews_userName_field, rev['userName'], usunft))
                doc.add(
                    Field(userreviews_userReview_field, rev['userReview'],
                          usurft))
                doc.add(
                    Field(userreviews_userReviewDate_field,
                          rev['userReviewDate'], usudft))
                doc.add(
                    Field(userreviews_userURL_field, rev['userURL'], usuuft))
            doc.add(IntField(docid_field, idx, Field.Store.YES))

            writer.addDocument(doc)
        print 'lucene index created'
        writer.commit()
        writer.close()
        print 'writing lucene indexing finished'
Example #9
	def reindex(self):
		''' Re-indexes the entire database into Index file'''
		start = time.time()

		# get all posts
		posts = self._tuples_to_dict(self._fetch_all_questions(), self._posts_fields)
		if not posts:
			raise Exception("FATAL Error: Could not fetch posts from Database")

		# open indexer
		# lucene.initVM(vmargs=['-Djava.awt.headless=true'])
		# print 'lucene', lucene.VERSION

		store = SimpleFSDirectory(File(self.index_dir))
		analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
		config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
		config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
		writer = IndexWriter(store, config)

		indexedField = FieldType()
		indexedField.setIndexed(True)
		indexedField.setStored(True)
		indexedField.setTokenized(True)
		indexedField.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

		storedField = FieldType()
		storedField.setIndexed(False)
		storedField.setStored(True)
		storedField.setTokenized(False)
		storedField.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

		fieldTypes = {
						'type'		: storedField,
						'id'		: storedField,
						'title'		: indexedField,
						'question'	: indexedField,
						'answer'	: indexedField,
						# 'comment'	: indexedField,
						'tag'		: indexedField,
						'extra'		: indexedField,
		}

		# get their comments
		num_docs = 0
		for post in posts:
			if self.status_mode: print "\r {0:.2f} %complete".format(((num_docs/142627.0)*100)),
			if self.debug : print "\n","*"*20,"\nIndexing post: ", post['id'], "from ", post['extra']
			if self.debug and self.verbose_values: print post
			answers = self._tuples_to_dict(self._fetch_all_answers(post['id'], post['extra']), self._answer_fields)


			# add comment field
			for answer in answers:
				num_docs += 1
				if self.debug: print "\n","+"*10, "\nMaking new Document"
				doc = Document()
				if self.debug: print "Adding doc type"
				doc.add(Field("type", self.doctype, fieldTypes['type']))
				
				# make fields
				if self.debug: print "Adding post fields"
				for i in xrange(len(self._posts_fields)):
					f = Field(self._posts_fields[i], self._cleanup_tag(post[self._posts_fields[i]]), fieldTypes[self._posts_fields[i]])
					f.setBoost(self._fields_boost[self._posts_fields[i]])
					doc.add(f)


				if self.status_mode: print "\t Indexing answer: ", answer['answer_id']
				if self.debug and self.verbose_values: print answer
				# answered_doc = copy.deepcopy(doc)
				# make comment field
				f = Field("answer", self._cleanup_tag(answer['answer']), fieldTypes['answer'])
				f.setBoost(self._fields_boost['answer'])
				doc.add(f)
				# calculate paths
				# commented_doc = copy.deepcopy(answered_doc)
				# comments = self._comments_to_comment_string(self._tuples_to_dict(self._fetch_all_comments(answer['id']), self._comment_fields))

				# if self.debug: print "\t\tAdding comments: ", comments
				# commented_doc.add(Field("comment", self._cleanup_tag(comments), fieldTypes['comment']))

				# write index
				if self.debug: print "\tAdding document {doc_id} to index".format(doc_id=post['id'])
				writer.addDocument(doc)

				# del answered_doc
				# del commented_doc

			if self.debug: print "Commiting document to index"
			writer.commit()

		# close index
		if self.status_mode: print "Closing index write"
		writer.close()
		end = time.time() - start

		if self.status_mode: print "\n","-"*20, \
			"\nTotal time spent in indexing: ", end, "seconds" \
			"\nIndexed {num_docs} documents".format(num_docs=num_docs)
Example #10
    def indexDocs(self, root, writer):
        global countAll

        print 'indexDocs working'

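        # t1: analyzed, stored, freqs only (ingredient, taste, ...); t2: analyzed, stored, with positions (name, content)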
        t1 = FieldType()
        t1.setIndexed(True)
        t1.setStored(True)
        t1.setTokenized(True)
        t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
        
        t2 = FieldType()
        t2.setIndexed(True)
        t2.setStored(True)
        t2.setTokenized(True)
        t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        for root, dirnames, filenames in os.walk(root):
            for filename in filenames:
                try:
                    path = os.path.join(root, filename)
                    file = open(path)
                    string = file.read()
                    file.close()
                    if string == '':
                        continue

                    name, img, ingredient, taste, tech, others, content = string.split("|",6)
                    if len(img) < 55 and len(img) != 40:
                        continue
                    
                    countAll+=1
                    print "adding", filename

                    tmplist = content.split('|')
                    url = tmplist[-1]
                    content = "".join(tmplist[:-1])

                    # content = content.decode('utf-8')
                    # seg_list = re.split('(['+c_punc+e_punc+'])',content)
                    content_show = ' '.join(jieba.cut(content))
                    content_search = ' '.join(jieba.cut_for_search(content))
                    # for seg in seg_list:
                    #     if len(seg) == 1:
                    #         content_show += ' ' + seg.encode('utf-8')
                    #         content_search += ' ' + seg.encode('utf-8')
                    #     if len(seg) > 1:
                    #         content_show += ' ' + ' '.join(jieba.cut(seg.encode('utf-8'))) 
                    #         content_search += ' ' + ' '.join(jieba.cut_for_search(seg.encode('utf-8'))) 

                    name_not_cut = "".join(my_jieba.cut(name))
                    name_show = " ".join(jieba.cut(name))
                    name_search = " ".join(jieba.cut_for_search(name))
                    tmp = ingredient.split(',')
                    ingredient = ''
                    for t in tmp:
                        ingredient += ','.join(jieba.cut(t)) + ' '
                    tmp = others.split(',')
                    others = ''
                    for t in tmp:
                        others += ','.join(jieba.cut(t)) + ' '


                    doc = Document()
                    doc.add(Field("name", name_show, t2))
                    doc.add(Field("name_not_cut", name_not_cut, t2))
                    doc.add(Field("nameforsearch", name_search, t2))
                    doc.add(Field("img", img, Field.Store.YES, Field.Index.NOT_ANALYZED))
                    doc.add(Field("url", url, Field.Store.YES, Field.Index.NOT_ANALYZED))
                    doc.add(Field("ingredient", ingredient, t1))
                    doc.add(Field("taste", taste, t1))
                    doc.add(Field("tech", tech, t1))
                    doc.add(Field("others", others, t1))
                    f_show = Field("content", content_show, t2)
                    f_search = Field("contentforsearch", content_search, t2)
                    f_search.setBoost(0.1)
                    doc.add(f_show)
                    doc.add(f_search)
                    writer.addDocument(doc)

                except Exception, e:
                    print "Failed in indexDocs:", e
Example #11
    def indexDocs(self, root, writer):
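        # t1: untokenized, stored (path); t2: tokenized, not stored (contents);
        # t3: stored only (homepage, logo); t4: tokenized, stored (names, intro, goods)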
        t1 = FieldType()
        t1.setIndexed(True)
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
        
        t2 = FieldType()
        t2.setIndexed(True)
        t2.setStored(False)
        t2.setTokenized(True)
        t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        t3 = FieldType()
        t3.setIndexed(False)
        t3.setStored(True)
        t3.setTokenized(False)
        t3.setIndexOptions(FieldInfo.IndexOptions.DOCS_ONLY)

        t4 = FieldType()
        t4.setIndexed(True)
        t4.setStored(True)
        t4.setTokenized(True)
        t4.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        for root, dirnames, filenames in os.walk(root):
            for filename in filenames:
                print "adding", filename
                try:
                    path = os.path.join(root, filename)
                    file = open(path)
                    contents = unicode(file.read(), 'utf-8')
                    file.close()
                    doc = Document()
                    doc.add(Field("path", root, t1))
                    if len(contents) > 0:
                        tmp = contents.split('\n')
                        name1 = list(tmp[0].split())[0]
                        name2 = list(tmp[0].split())[1]
                        content = []
                        tmp2 = tmp[1].split('.')
                        for i in range(len(tmp2)):
                            if tmp2[i] == "www":
                                content.append(tmp2[i + 1])
                        homepage = tmp[1]
                        intro = tmp[2]
                        for i in range(20):
                            content.append(name1)
                            content.append(name2)
                        content.extend(jieba.cut(tmp[2]))

                        logo = tmp[3]
                        goods = ""
                        if len(tmp) > 4:
                            goods = '\n'.join(tmp[4 :])
                            
                            for i in range(len(tmp)):
                                if i > 3:
                                    tmp3 = tmp[i].split()
                                    content.extend(jieba.cut(tmp3[1]))
                        content = ' '.join(content)

                        name1_field = Field("name1", name1, t4)
                        name1_field.setBoost(1.9)
                        doc.add(name1_field)
                        name2_field = Field("name2", name2, t4)
                        name2_field.setBoost(1.9)
                        doc.add(name2_field)
                        doc.add(Field("homepage", homepage, t3))
                        intro_field = Field("intro", intro, t4)
                        doc.add(intro_field)
                        doc.add(Field("intro", intro, t4))
                        doc.add(Field("logo", logo, t3))
                        doc.add(Field("goods", goods, t4))
                        
                        contents_field = Field("contents", content, t2)
                        doc.add(contents_field)

                    else:
                        print "warning: no content in %s" % filename
                    writer.addDocument(doc)
                except Exception, e:
                    print "Failed in indexDocs:", e