def addDocument(self, writer, new_doc, metadata, fields_to_process, bow_info):
    """
        Add a document to the index, using direct Lucene access.

        :param new_doc: dict of fields with values
        :type new_doc: dict
        :param metadata: dict of metadata for the document
        :type metadata: dict
        :param fields_to_process: only add these fields from the doc dict
        :type fields_to_process: list
    """
    doc = Document()
    total_numTerms = bow_info["total_numterms"]
    # each BOW now comes with its field
    for field in fields_to_process:
        field_object = Field(field, new_doc[field], Field.Store.NO,
                             Field.Index.ANALYZED, Field.TermVector.YES)
##        boost = math.sqrt(numTerms[field]) / float(math.sqrt(total_numTerms)) if total_numTerms > 0 else float(0)
        boost = 1 / float(math.sqrt(total_numTerms)) if total_numTerms > 0 else float(0)
        field_object.setBoost(float(boost))
        doc.add(field_object)

    json_metadata = json.dumps(metadata)
    guid = metadata["guid"]  # assumption: the document GUID travels in the metadata dict
    doc.add(Field("guid", guid, Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("bow_info", json.dumps(bow_info), Field.Store.YES, Field.Index.NO))
    doc.add(Field("metadata", json_metadata, Field.Store.YES, Field.Index.NO))
    doc.add(Field("year_from", metadata["year"], Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
def add_article(self, article):
    # construct a document
    doc = Document()

    title = Field('title', article.title, Field.Store.YES, Field.Index.ANALYZED)
    title.setBoost(10.0)
    doc.add(title)

    description = Field('description', article.description, Field.Store.YES, Field.Index.ANALYZED)
    description.setBoost(5.0)
    doc.add(description)

    doc.add(Field('keywords', article.keywords, Field.Store.YES, Field.Index.NOT_ANALYZED))
    doc.add(Field('content', article.content, Field.Store.YES, Field.Index.ANALYZED))

    if article.date:
        doc.add(Field('date', article.date, Field.Store.YES, Field.Index.NOT_ANALYZED))
    if article.last_modified:
        doc.add(Field('last_modified', article.last_modified, Field.Store.YES, Field.Index.NOT_ANALYZED))
    if article.images:
        doc.add(Field('image_url', article.images[0][0], Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field('image_text', article.images[0][1], Field.Store.YES, Field.Index.ANALYZED))

    doc.add(Field('url', article.url, Field.Store.YES, Field.Index.NOT_ANALYZED))

    # creates the document, or updates it if it already exists
    self.writer.updateDocument(Term("url", article.url), doc)
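# The updateDocument() call above is what keeps re-crawled articles from piling up: Lucene
# deletes any document whose exact "url" term matches and then adds the new one, so the url
# field behaves like a primary key. Below is a minimal sketch of the same upsert pattern,
# assuming the flat PyLucene 3.x `lucene` module that the Field.Store/Field.Index constants
# above imply; the field names and helper are illustrative only.
from lucene import Document, Field, Term

def upsert_page(writer, url, body):
    # Build a fresh document for this URL.
    doc = Document()
    doc.add(Field('url', url, Field.Store.YES, Field.Index.NOT_ANALYZED))
    doc.add(Field('content', body, Field.Store.YES, Field.Index.ANALYZED))
    # Delete-then-add in one call, keyed on the exact url term.
    writer.updateDocument(Term('url', url), doc)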
def testDocBoost(self):
    writer = self.getWriter(analyzer=SimpleAnalyzer(Version.LUCENE_CURRENT))

    f1 = Field("field", "word", TextField.TYPE_STORED)
    f2 = Field("field", "word", TextField.TYPE_STORED)
    f2.setBoost(2.0)

    d1 = Document()
    d2 = Document()

    d1.add(f1)  # boost = 1
    d2.add(f2)  # boost = 2

    writer.addDocument(d1)
    writer.addDocument(d2)
    writer.close()

    scores = [0.0] * 2

    class collector(PythonCollector):
        def __init__(_self, scores):
            super(collector, _self).__init__()
            _self.scores = scores
            _self.base = 0

        def collect(_self, doc, score):
            _self.scores[doc + _self.base] = score

        def setNextReader(_self, context):
            _self.base = context.docBase

        def acceptsDocsOutOfOrder(_self):
            return True

    self.getSearcher().search(TermQuery(Term("field", "word")), collector(scores))

    lastScore = 0.0
    for score in scores:
        self.assert_(score > lastScore)
        lastScore = score
def create_index(index):
    indexDir = SimpleFSDirectory(File(index))
    stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
    for s in stopwords:
        stops.add(s)
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
    writer = IndexWriter(indexDir, writerConfig)

    print "%d docs in index" % writer.numDocs()
    print "Reading Documents"

    f = open('f:/nlp/data/questions/combine.txt')
    for line in f:
        line = get_data_from_text(line.decode('utf-8'))
        doc = Document()
        field = Field("text", line, Field.Store.YES, Field.Index.ANALYZED)
        field.setBoost(2.0)
        doc.add(field)
        writer.addDocument(doc)

    print "Indexed (%d docs in index)" % (writer.numDocs())
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
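# A hypothetical companion to create_index(): open the same directory and query the "text"
# field it builds. This is only a sketch against the PyLucene 4.10 class layout the indexer
# already uses; for brevity it queries with a plain StandardAnalyzer rather than the
# stopword-augmented analyzer used at index time.
from java.io import File
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.util import Version

def search_index(index, query_string, top_n=10):
    # Open a reader over the directory written by create_index().
    reader = DirectoryReader.open(SimpleFSDirectory(File(index)))
    searcher = IndexSearcher(reader)
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    query = QueryParser(Version.LUCENE_4_10_1, "text", analyzer).parse(query_string)
    for hit in searcher.search(query, top_n).scoreDocs:
        print searcher.doc(hit.doc).get("text"), hit.score
    reader.close()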
def indexTable(self, writer):
    # connection
    con = None

    # define the index of all the fields
    # ---------step 2: connect to mysql----------
    con = mdb.connect('localhost', 'root', 'testgce', 'douban_movie_v3')

    # t_num = FieldType.NumericType is wrong!!
    t_num = FieldType()
    t_num.setStored(False)

    t1 = FieldType()
    t1.setIndexed(True)
    t1.setStored(True)
    t1.setTokenized(False)
    t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

    t2 = FieldType()
    t2.setIndexed(True)
    t2.setStored(False)
    t2.setTokenized(True)
    t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    t3 = FieldType()
    t3.setIndexed(True)
    t3.setStored(True)
    t3.setTokenized(True)
    t3.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

    maxDict = utils.maxDict
    # range for the per-document boost value
    base = DOC_BOOST_RANGE[0]
    upper = DOC_BOOST_RANGE[1]

    with con:
        # Careful with codecs
        con.set_character_set('utf8')
        cur = con.cursor()
        # Again the codecs
        cur.execute('SET NAMES utf8;')
        cur.execute('SET CHARACTER SET utf8;')
        cur.execute('SET character_set_connection=utf8;')

        # ------step 3: choose the right table------
        cur.execute("SELECT * FROM movie_items")

        numrows = int(cur.rowcount)
        print 'numrows:', numrows
        for i in range(numrows):
            print
            row = cur.fetchone()

            # ------step 4: index your fields------
            summary = row[SUMMARY]
            subject_id = row[SUBJECT_ID]
            print 'id' + subject_id

            year = utils.formatYear(row[YEAR])
            wtfFile = open('wtf.txt', 'a')  # log years that fail to parse
            try:
                date = DateTools.stringToDate(year.replace('-', ' '))
                dateStr = DateTools.dateToString(date, DateTools.Resolution.DAY)
            except:
                wtfFile.write(year + '\n')

            doc = Document()

            # boosting
            boostProb = utils.calcBoostProb(row, maxDict, dateStr)
            boost = base + boostProb * (upper - base)

            doc.add(FloatField("boost", boost, Field.Store.YES))
            doc.add(StringField("year", dateStr, Field.Store.YES))
            print 'dateStr:' + dateStr

            # A text field is a sequence of terms that has been tokenized, while a string
            # field is a single term (although it can also be multivalued).
            do_count = row[DO_COUNT] if row[DO_COUNT] != None else 0
            wish_count = row[WISH_COUNT] if row[WISH_COUNT] != None else 0

            # fields which should not be analyzed
            doc.add(FloatField("rating_average", float(row[RATING_AVERAGE]), Field.Store.YES))
            doc.add(FloatField("rating_stars", float(row[RATING_STARS]), Field.Store.YES))
            doc.add(IntField("reviews_count", int(row[REVIEWS_COUNT]), Field.Store.YES))
            #doc.add(FloatField("year", float(row[YEAR]), Field.Store.YES).setBoost(boost))
            doc.add(IntField("collect_count", int(row[COLLECT_COUNT]), Field.Store.YES))
            doc.add(IntField("do_count", int(do_count), Field.Store.YES))
            doc.add(IntField("wish_count", int(wish_count), Field.Store.YES))
            doc.add(IntField("subject_id", int(row[SUBJECT_ID]), Field.Store.YES))
            doc.add(IntField("comments_count", int(row[COMMENTS_COUNT]), Field.Store.YES))
            doc.add(IntField("ratings_count", int(row[RATINGS_COUNT]), Field.Store.YES))
            doc.add(StringField("image_small", row[IMAGE_SMALL], Field.Store.YES))

            # fields which should be analyzed with WhitespaceAnalyzer
            # attention!!! don't chain add and setBoost like:
            #   doc.add(Field("genres", row[GENRES].replace(delim, ' '), t3).setBoost(boost))
            # or you'll get a null pointer error (see the sketch after this function)
            f = Field("countries", row[COUNTRIES].replace(delim, ' '), t3)
            f.setBoost(boost)
            doc.add(f)

            # process casts
            raw_casts = row[CASTS].replace(delim, ' ')
            f = Field("raw_casts", raw_casts, t1)
            f.setBoost(boost)
            doc.add(f)
            # replace the '·' inside Western names with a space
            raw_casts = raw_casts.replace('·', ' ')
            casts = raw_casts
            if len(raw_casts.split(' ')) < CASTS_LEN:
                # the average cast-list length is 4, so pad short lists
                casts = raw_casts + ' ¥¥¥¥' * (CASTS_LEN - len(raw_casts.split(' ')))
            f = Field("casts", casts, t3)
            f.setBoost(boost)
            doc.add(f)

            # process directors
            raw_directors = row[DIRECTORS].replace(delim, ' ')
            f = Field("raw_directors", raw_directors, t1)
            f.setBoost(boost)
            doc.add(f)
            # replace the '·' inside Western names with a space
            raw_directors = raw_directors.replace('·', ' ')
            directors = raw_directors
            if len(raw_directors.split(' ')) < DIRECTORS_LEN:
                # the average director-list length is 4, so pad short lists
                directors = raw_directors + ' ¥¥¥¥' * (DIRECTORS_LEN - len(raw_directors.split(' ')))
            f = Field("directors", directors, t3)
            f.setBoost(boost)
            doc.add(f)

            f = Field("genres", row[GENRES].replace(delim, ' '), t3)
            f.setBoost(boost)
            doc.add(f)

            f = Field("subtype", row[SUBTYPE].replace(delim, ' '), t3)
            f.setBoost(boost)
            doc.add(f)

            # this is wrong because an indexable field has no setBoost method:
            # fieldList = doc.getFields()  # not a Python 'list' but a Java 'List', which is unindexable
            # for eachField in fieldList:
            #     eachField.setBoost(boost)

            # store the raw user_tags / others_like strings; reRank needs them
            doc.add(StringField("raw_user_tags", row[USER_TAGS], Field.Store.YES))
            doc.add(StringField("raw_others_like", row[OTHERS_LIKE], Field.Store.YES))

            user_tags_str = ''
            others_like_str = ''
            tags_len = 0
            if row[USER_TAGS] != '':
                user_tags_list = row[USER_TAGS].split(delim)
                for tag_pair in user_tags_list:
                    # the string ends with '¥', so the last element after the split is empty
                    if tag_pair != '':
                        #print 'tag_pair' + tag_pair + 'hhe'
                        tag_name = tag_pair.split(delim_uo)[0] + ' '  # don't forget this space!!
                        tag_num = tag_pair.split(delim_uo)[1]
                        tag_num_processed = int(int(tag_num) / TAG_SPAN) + 1  # at least 1
                        user_tags_str = user_tags_str + ' ' + tag_name * tag_num_processed
                        tags_len = tags_len + tag_num_processed  # running total of tag words
                if tags_len < TAGS_AVER_LEN:
                    # pad the tags; 3 looks like the average length, hence ' ¥¥¥'
                    user_tags_str = user_tags_str + ' ¥¥¥' * (TAGS_AVER_LEN - tags_len)

            if row[OTHERS_LIKE] != '':
                for like_pair in row[OTHERS_LIKE].split(delim):
                    if like_pair != '':
                        others_like_str = others_like_str + ' ' + like_pair.split(delim_uo)[1]

            # start processing adjs
            if row[ADJS] != None:
                raw_adjs = row[ADJS][:-1]
                adjs_str = ''
                adjs_len = 0
                if row[ADJS] != '' and row[ADJS] != '\n':
                    # e.g. '重要=4.0,特殊=4.0'
                    adjs_str = row[ADJS]
                    adjs_list = adjs_str.split(',')
                    for adj_pair in adjs_list:
                        #print 'adj_pair:' + adj_pair + 'hhe'
                        adj_name = adj_pair.split('=')[0]
                        adj_num = adj_pair.split('=')[1]
                        # strip the newline and convert to int
                        if adj_num[-1] == '\n':
                            adj_num = adj_num[0:-1]
                        adj_num = int(float(adj_num))
                        add_adj = ''
                        # # synonyms
                        # adj_name_bro = searchDictValue(adjMap, adj_name)
                        # if adj_name_bro == -1:  # no synonym found, add nothing
                        #     add_adj = ''
                        # else:
                        #     add_adj = (adj_name_bro + ' ') * adj_num
                        #     raw_adjs = raw_adjs + ',' + adj_name_bro + '=' + str(adj_num)
                        adjs_str = adjs_str + ' ' + (adj_name + ' ') * adj_num + add_adj
                        adjs_len = adjs_len + adj_num  # running total of adjs
                #print raw_adjs
                doc.add(StringField("raw_adjs", raw_adjs, Field.Store.YES))
                if adjs_len < ADJS_AVER_LEN:
                    # pad adjs_str; 2 looks like the average length, hence ' ¥¥'
                    adjs_str = adjs_str + ' ¥¥' * (ADJS_AVER_LEN - adjs_len)
                f = Field("adjs", adjs_str, t3)
                f.setBoost(boost)
                doc.add(f)

            f = Field("user_tags", user_tags_str, t3)
            f.setBoost(boost)
            doc.add(f)

            f = Field("others_like", others_like_str, t3)
            f.setBoost(boost)
            doc.add(f)

            # fields which should be analyzed with a good analyzer
            f = Field("title", row[TITLE], t3)
            f.setBoost(boost)
            doc.add(f)

            f = Field("original_title", row[ORIGINAL_TITLE], t3)
            f.setBoost(boost)
            doc.add(f)

            f = Field("summary_segmentation", row[SUMMARY_SEGMENTATION], t3)
            f.setBoost(boost)
            doc.add(f)

            f = Field("aka", row[AKA], t2)
            f.setBoost(boost)
            doc.add(f)

            if len(summary) > 0:
                print subject_id + '--->' + ':\n ' + row[TITLE]
                try:
                    summary_unicoded = unicode(summary, 'utf-8')  # test the encoding
                except Exception, e:
                    print "Decode Failed: ", e
                f = Field('summary', summary, t2)
                f.setBoost(boost)
                doc.add(f)
            else:
                print "warning:\n" + subject_id + '---> No content!'

            print 'boosting:' + str(boost)
            # for debug
            if boost > upper:
                print boostProb
                print maxDict
                exit(0)

            writer.addDocument(doc)
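# The "null pointer" warning inside indexTable() comes from the fact that Field.setBoost()
# is a void Java method, so it returns None in Python; chaining it inside doc.add() therefore
# adds None to the document. A small hypothetical helper (the name is illustrative, not part
# of the original project) captures the create / boost / add pattern repeated above:
def add_boosted_field(doc, name, value, ftype, boost):
    # Never write doc.add(Field(...).setBoost(boost)): setBoost() returns None.
    f = Field(name, value, ftype)
    f.setBoost(boost)
    doc.add(f)

# e.g. add_boosted_field(doc, "genres", row[GENRES].replace(delim, ' '), t3, boost)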
def main(use_elasticsearch=True, calculate_PageRank=False, tele_const=0.2):
    """
    main entry for the indexer module.
    """
    jsons_root_dir = 'JSONs/'

    # list of addresses of all json files
    all_json_dirs = glob.glob(jsons_root_dir + '*.json')

    # first, read all json files
    jsons = []
    for jdir in all_json_dirs:
        with open(jdir, 'r') as f:
            jsn = json.load(f)
            jsons.append(jsn)

    print len(jsons), ' json files imported.'

    # now create a set of all links, and then a list of all links in the json files
    print 'creating a list of all links'
    links_set = set()
    for js in jsons:
        links_set.add(js["url"])
        for l in js["outlinks"]:
            links_set.add(l)

    print len(links_set), ' links found'
    links = list(links_set)

    ## if the user has selected to index documents using Elasticsearch
    # Note that when using Elasticsearch, PageRank is ignored
    if use_elasticsearch:
        from elasticsearch import Elasticsearch
        from elasticsearch_dsl import Search, document, field, connections, Q
        from elasticsearch_dsl.connections import connections
        print 'Using Elasticsearch for indexing, PageRank is ignored'

        es = Elasticsearch()
        es.indices.create(index='book-index', ignore=[400, 404])
        connections.create_connection(hosts=['localhost'], timeout=20)
        connections.add_connection('book', es)
        Book.init('book-index')

        ## add all documents to the index 'book-index'
        for idx, js in enumerate(jsons):
            book = Book(average=js['average'],
                        cover=js['cover'],
                        description=js['description'].encode('utf-8'),
                        ratings=js['ratings'],
                        reviews=js['reviews'],
                        title=js['title'],
                        url=js['url'],
                        outlinks=js['outlinks'])
            book.add_authors(js['authors'])
            book.add_userreviews(js['userreviews'])
            book.id = idx
            book.save()

        print 'Elasticsearch index created'

    ### use pyLucene instead
    else:
        import lucene
        from java.io import File
        from org.apache.lucene.index import IndexWriterConfig, IndexWriter, FieldInfo
        from org.apache.lucene.document import Document, Field, FieldType, IntField, FloatField
        from org.apache.lucene.store import SimpleFSDirectory
        from org.apache.lucene.util import Version
        from org.apache.lucene.analysis.standard import StandardAnalyzer

        print 'Using Lucene for indexing'

        ## if the user has selected to calculate the PageRank
        if calculate_PageRank:
            # now create the unnormalized adjacency matrix
            print 'creating the unnormalized adjacency matrix.'
            adjacency = np.zeros((len(links_set), len(links_set)))
            for js in jsons:
                node_idx = links.index(js["url"])
                for l in js["outlinks"]:
                    out_idx = links.index(l)
                    adjacency[node_idx, out_idx] += 1

            print 'the unnormalized adjacency matrix created.'
            print 'normalizing the adjacency matrix with teleporting constant value of ', tele_const
            norm_mat = Normalize(adjacency, tele_const)
            print 'calculating the PageRank scores'
            pr_scores = PageRankScore(norm_mat)

        ## here goes the pyLucene code, which means I should switch to the damn Ubuntu
        index_folder = '.'
        index_name = 'lucene.index'
        index_path = os.path.join(index_folder, index_name)

        print 'initializing Lucene VM'
        lucene.initVM()
        print 'lucene version ', lucene.VERSION

        version = Version.LUCENE_CURRENT
        index_store = SimpleFSDirectory(File(index_path))
        analyzer = StandardAnalyzer(version)
        config = IndexWriterConfig(version, analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer = IndexWriter(index_store, config)

        # Options
        TokenizeFields = True

        # Title field type
        title_field = 'title'
        tft = FieldType()
        tft.setIndexed(True)
        tft.setStored(True)
        tft.setTokenized(TokenizeFields)
        tft.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)  # only index the document and frequency data

        # Authors name field type
        authors_name_field = 'authors_name'
        anft = FieldType()
        anft.setIndexed(True)
        anft.setStored(True)
        anft.setTokenized(TokenizeFields)
        anft.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

        # Authors url field type
        authors_url_field = 'authors_url'
        auft = FieldType()
        auft.setIndexed(False)
        auft.setStored(True)

        # Average rating field type
        average_field = 'average'

        # Cover image URL field type
        cover_field = 'cover'
        cft = FieldType()
        cft.setIndexed(False)
        cft.setStored(True)

        # Book description field type
        description_field = 'description'
        descft = FieldType()
        descft.setIndexed(True)
        descft.setStored(True)
        descft.setTokenized(TokenizeFields)
        descft.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        # Outlinks field type
        outlinks_field = "outlinks"
        outft = FieldType()
        outft.setIndexed(False)
        outft.setStored(True)

        # Ratings count field type
        ratings_field = 'ratings'

        # Reviews count field type
        reviews_field = 'reviews'

        # URL field type
        url_field = 'url'
        uft = FieldType()
        uft.setIndexed(False)
        uft.setStored(True)

        # userreviews.userName field type
        userreviews_userName_field = 'userreviews_userName'
        usunft = FieldType()
        usunft.setIndexed(False)
        usunft.setStored(True)

        # userreviews.userReview field type
        userreviews_userReview_field = 'userreviews_userReview'
        usurft = FieldType()
        usurft.setIndexed(True)
        usurft.setStored(False)
        usurft.setTokenized(TokenizeFields)
        usurft.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

        # userreviews.userReviewDate field type
        userreviews_userReviewDate_field = 'userreviews_userReviewDate'
        usudft = FieldType()
        usudft.setIndexed(False)
        usudft.setStored(True)

        # userreviews.userURL field type
        userreviews_userURL_field = 'userreviews_userURL'
        usuuft = FieldType()
        usuuft.setIndexed(False)
        usuuft.setStored(True)

        docid_field = 'docid'

        for idx, js in enumerate(jsons):
            boostVal = js['average']
            if calculate_PageRank:
                boostVal *= pr_scores[links.index(js['url'])]

            doc = Document()
            for author in js['authors']:
                doc.add(Field(authors_name_field, author['name'], anft))
                doc.add(Field(authors_url_field, author['url'], auft))
            doc.add(FloatField(average_field, float(js['average']), Field.Store.YES))
            doc.add(Field(cover_field, js['cover'], cft))

            df = Field(description_field, js['description'], descft)
            df.setBoost(boostVal)
            doc.add(df)

            for u in js['outlinks']:
                doc.add(Field(outlinks_field, u, outft))
            doc.add(IntField(ratings_field, js['ratings'], Field.Store.YES))
            doc.add(IntField(reviews_field, js['reviews'], Field.Store.YES))

            tf = Field(title_field, js['title'], tft)
            tf.setBoost(boostVal)
            doc.add(tf)

            doc.add(Field(url_field, js['url'], uft))
            for rev in js['userreviews']:
                doc.add(Field(userreviews_userName_field, rev['userName'], usunft))
                doc.add(Field(userreviews_userReview_field, rev['userReview'], usurft))
                doc.add(Field(userreviews_userReviewDate_field, rev['userReviewDate'], usurft))
                doc.add(Field(userreviews_userURL_field, rev['userURL'], usuuft))
            doc.add(IntField(docid_field, idx, Field.Store.YES))

            writer.addDocument(doc)

        print 'lucene index created'
        writer.commit()
        writer.close()
        print 'writing lucene indexing finished'
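# Normalize() and PageRankScore() above are imported from elsewhere in the project. The sketch
# below is only one plausible reading of what they compute, i.e. the standard teleporting
# random-walk formulation (row-normalize outlink counts, blend in a uniform jump with
# probability tele_const, then power-iterate); it is an assumption, not the project's code.
import numpy as np

def Normalize(adjacency, tele_const):
    # Row-normalize outlink counts into transition probabilities; dangling rows
    # (no outlinks) become uniform. Then mix in the teleport term.
    n = adjacency.shape[0]
    row_sums = adjacency.sum(axis=1, keepdims=True)
    transition = np.where(row_sums > 0,
                          adjacency / np.where(row_sums == 0, 1.0, row_sums),
                          1.0 / n)
    return (1.0 - tele_const) * transition + tele_const / n

def PageRankScore(norm_mat, iterations=100, tol=1e-10):
    # Power iteration: push a uniform score vector through the transition matrix
    # until it stops changing.
    n = norm_mat.shape[0]
    scores = np.ones(n) / n
    for _ in range(iterations):
        new_scores = norm_mat.T.dot(scores)
        if np.abs(new_scores - scores).sum() < tol:
            return new_scores
        scores = new_scores
    return scores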
def reindex(self):
    ''' Re-indexes the entire database into the index file '''
    start = time.time()

    # get all posts
    posts = self._tuples_to_dict(self._fetch_all_questions(), self._posts_fields)
    if not posts:
        raise Exception("FATAL Error: Could not fetch posts from Database")

    # open indexer
    # lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    # print 'lucene', lucene.VERSION
    store = SimpleFSDirectory(File(self.index_dir))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
    writer = IndexWriter(store, config)

    indexedField = FieldType()
    indexedField.setIndexed(True)
    indexedField.setStored(True)
    indexedField.setTokenized(True)
    indexedField.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    storedField = FieldType()
    storedField.setIndexed(False)
    storedField.setStored(True)
    storedField.setTokenized(False)
    storedField.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

    fieldTypes = {
        'type': storedField,
        'id': storedField,
        'title': indexedField,
        'question': indexedField,
        'answer': indexedField,
        # 'comment': indexedField,
        'tag': indexedField,
        'extra': indexedField,
    }

    # get their comments
    num_docs = 0
    for post in posts:
        if self.status_mode:
            print "\r {0:.2f} %complete".format(((num_docs / 142627.0) * 100)),
        if self.debug:
            print "\n", "*" * 20, "\nIndexing post: ", post['id'], "from ", post['extra']
        if self.debug and self.verbose_values:
            print post

        answers = self._tuples_to_dict(self._fetch_all_answers(post['id'], post['extra']), self._answer_fields)

        # add an answer field per answer
        for answer in answers:
            num_docs += 1
            if self.debug:
                print "\n", "+" * 10, "\nMaking new Document"
            doc = Document()

            if self.debug:
                print "Adding doc type"
            doc.add(Field("type", self.doctype, fieldTypes['type']))

            # make fields
            if self.debug:
                print "Adding post fields"
            for i in xrange(len(self._posts_fields)):
                f = Field(self._posts_fields[i], self._cleanup_tag(post[self._posts_fields[i]]), fieldTypes[self._posts_fields[i]])
                f.setBoost(self._fields_boost[self._posts_fields[i]])
                doc.add(f)

            if self.status_mode:
                print "\t Indexing answer: ", answer['answer_id']
            if self.debug and self.verbose_values:
                print answer

            # answered_doc = copy.deepcopy(doc)
            # make the answer field
            f = Field("answer", self._cleanup_tag(answer['answer']), fieldTypes['answer'])
            f.setBoost(self._fields_boost['answer'])
            doc.add(f)

            # calculate paths
            # commented_doc = copy.deepcopy(answered_doc)
            # comments = self._comments_to_comment_string(self._tuples_to_dict(self._fetch_all_comments(answer['id']), self._comment_fields))
            # if self.debug: print "\t\tAdding comments: ", comments
            # commented_doc.add(Field("comment", self._cleanup_tag(comments), fieldTypes['comment']))

            # write index
            if self.debug:
                print "\tAdding document {doc_id} to index".format(doc_id=post['id'])
            writer.addDocument(doc)
            # del answered_doc
            # del commented_doc

    if self.debug:
        print "Committing documents to index"
    writer.commit()

    # close index
    if self.status_mode:
        print "Closing index writer"
    writer.close()

    end = time.time() - start
    if self.status_mode:
        print "\n", "-" * 20, \
            "\nTotal time spent in indexing: ", end, "seconds" \
            "\nIndexed {num_docs} documents".format(num_docs=num_docs)
def indexDocs(self, root, writer):
    global countAll
    print 'indexDocs working'

    t1 = FieldType()
    t1.setIndexed(True)
    t1.setStored(True)
    t1.setTokenized(True)
    t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

    t2 = FieldType()
    t2.setIndexed(True)
    t2.setStored(True)
    t2.setTokenized(True)
    t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    for root, dirnames, filenames in os.walk(root):
        for filename in filenames:
            try:
                path = os.path.join(root, filename)
                file = open(path)
                string = file.read()
                file.close()
                if string == '':
                    continue

                name, img, ingredient, taste, tech, others, content = string.split("|", 6)
                if len(img) < 55 and len(img) != 40:
                    continue
                countAll += 1
                print "adding", filename

                tmplist = content.split('|')
                url = tmplist[-1]
                content = "".join(tmplist[:-1])
                # content = content.decode('utf-8')
                # seg_list = re.split('([' + c_punc + e_punc + '])', content)
                content_show = ' '.join(jieba.cut(content))
                content_search = ' '.join(jieba.cut_for_search(content))
                # for seg in seg_list:
                #     if len(seg) == 1:
                #         content_show += ' ' + seg.encode('utf-8')
                #         content_search += ' ' + seg.encode('utf-8')
                #     if len(seg) > 1:
                #         content_show += ' ' + ' '.join(jieba.cut(seg.encode('utf-8')))
                #         content_search += ' ' + ' '.join(jieba.cut_for_search(seg.encode('utf-8')))

                name_not_cut = "".join(my_jieba.cut(name))
                name_show = " ".join(jieba.cut(name))
                name_search = " ".join(jieba.cut_for_search(name))

                tmp = ingredient.split(',')
                ingredient = ''
                for t in tmp:
                    ingredient += ','.join(jieba.cut(t)) + ' '

                tmp = others.split(',')
                others = ''
                for t in tmp:
                    others += ','.join(jieba.cut(t)) + ' '

                doc = Document()
                doc.add(Field("name", name_show, t2))
                doc.add(Field("name_not_cut", name_not_cut, t2))
                doc.add(Field("nameforsearch", name_search, t2))
                doc.add(Field("img", img, Field.Store.YES, Field.Index.NOT_ANALYZED))
                doc.add(Field("url", url, Field.Store.YES, Field.Index.NOT_ANALYZED))
                doc.add(Field("ingredient", ingredient, t1))
                doc.add(Field("taste", taste, t1))
                doc.add(Field("tech", tech, t1))
                doc.add(Field("others", others, t1))

                f_show = Field("content", content_show, t2)
                f_search = Field("contentforsearch", content_search, t2)
                f_search.setBoost(0.1)
                doc.add(f_show)
                doc.add(f_search)

                writer.addDocument(doc)
            except Exception, e:
                print "Failed in indexDocs:", e
def indexDocs(self, root, writer):
    t1 = FieldType()
    t1.setIndexed(True)
    t1.setStored(True)
    t1.setTokenized(False)
    t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

    t2 = FieldType()
    t2.setIndexed(True)
    t2.setStored(False)
    t2.setTokenized(True)
    t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    t3 = FieldType()
    t3.setIndexed(False)
    t3.setStored(True)
    t3.setTokenized(False)
    t3.setIndexOptions(FieldInfo.IndexOptions.DOCS_ONLY)

    t4 = FieldType()
    t4.setIndexed(True)
    t4.setStored(True)
    t4.setTokenized(True)
    t4.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    for root, dirnames, filenames in os.walk(root):
        for filename in filenames:
            print "adding", filename
            try:
                path = os.path.join(root, filename)
                file = open(path)
                contents = unicode(file.read(), 'utf-8')
                file.close()

                doc = Document()
                doc.add(Field("path", root, t1))

                if len(contents) > 0:
                    tmp = contents.split('\n')
                    name1 = list(tmp[0].split())[0]
                    name2 = list(tmp[0].split())[1]

                    content = []
                    tmp2 = tmp[1].split('.')
                    for i in range(len(tmp2)):
                        if tmp2[i] == "www":
                            content.append(tmp2[i + 1])

                    homepage = tmp[1]
                    intro = tmp[2]
                    # repeat the names so they dominate the contents field
                    for i in range(20):
                        content.append(name1)
                        content.append(name2)
                    content.extend(jieba.cut(tmp[2]))
                    logo = tmp[3]

                    goods = ""
                    if len(tmp) > 4:
                        goods = '\n'.join(tmp[4:])
                    for i in range(len(tmp)):
                        if i > 3:
                            tmp3 = tmp[i].split()
                            content.extend(jieba.cut(tmp3[1]))
                    content = ' '.join(content)

                    name1_field = Field("name1", name1, t4)
                    name1_field.setBoost(1.9)
                    doc.add(name1_field)

                    name2_field = Field("name2", name2, t4)
                    name2_field.setBoost(1.9)
                    doc.add(name2_field)

                    doc.add(Field("homepage", homepage, t3))

                    intro_field = Field("intro", intro, t4)
                    doc.add(intro_field)

                    doc.add(Field("logo", logo, t3))
                    doc.add(Field("goods", goods, t4))

                    contents_field = Field("contents", content, t2)
                    doc.add(contents_field)
                else:
                    print "warning: no content in %s" % filename

                writer.addDocument(doc)
            except Exception, e:
                print "Failed in indexDocs:", e
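# The same four-call setIndexed/setStored/setTokenized/setIndexOptions sequence recurs in
# indexTable(), reindex(), and both indexDocs() variants above. A small helper, assuming the
# same Lucene 4.x FieldType / FieldInfo.IndexOptions API those functions already use (and that
# lucene.initVM() has already been called), keeps the definitions in one place:
from org.apache.lucene.document import FieldType
from org.apache.lucene.index import FieldInfo

def make_field_type(indexed, stored, tokenized,
                    index_options=FieldInfo.IndexOptions.DOCS_AND_FREQS):
    # Bundle the four setters used throughout the indexers above.
    ft = FieldType()
    ft.setIndexed(indexed)
    ft.setStored(stored)
    ft.setTokenized(tokenized)
    ft.setIndexOptions(index_options)
    return ft

# e.g. the t1/t2 pair from the last indexDocs():
# t1 = make_field_type(indexed=True, stored=True, tokenized=False)
# t2 = make_field_type(indexed=True, stored=False, tokenized=True,
#                      index_options=FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)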