def get_doc(self, filename, path, title, url, contents):
    """Build a Lucene `Document` for one crawled webpage.

    Args:
        filename: filename of the webpage.
        path: filesystem path of the webpage.
        title: title of the webpage.
        url: original URL of the webpage.
        contents: extracted text of the webpage.

    Returns:
        A `Document` with name/path stored as exact terms and
        title/url/contents indexed as analyzed text.
    """
    doc = Document()
    # Exact-match fields (not tokenized).
    doc.add(StringField("name", filename, Field.Store.YES))
    doc.add(StringField("path", path, Field.Store.YES))
    # Full-text fields (analyzed).
    doc.add(TextField("title", title, Field.Store.YES))
    doc.add(TextField("url", url, Field.Store.YES))
    if len(contents) > 0:
        doc.add(TextField("contents", contents, Field.Store.YES))
    else:
        print("Warning: No content in {}".format(filename))
    return doc
def indexDocs(self, root, writer):
    """Walk the subdirectories of `root` and add one Document per file.

    Each file is expected to hold: line 1 = original URL, line 2 = title,
    remainder = page contents.  Contents and URL are segmented with jieba
    before indexing.

    NOTE(review): files sitting directly in `root` (not inside a
    subdirectory) are never indexed, and because the outer os.walk
    already recurses, deeply nested files may be visited more than
    once — confirm this is intended.
    """
    for root, dirnames, filenames in os.walk(root):
        for dirname in dirnames:  # iterate over the sub-folders
            path1 = os.path.join(root, dirname)
            # iterate over the files inside this folder
            for trivial1, trivial2, filenames in os.walk(path1):
                for filename in filenames:
                    #print(root,dirnames,filename)
                    print("adding", filename)
                    # try:
                    path = os.path.join(path1, filename)
                    file = open(path, encoding='utf8')
                    page = file.readline()    # line 1: original URL
                    title = file.readline()   # line 2: page title
                    contents = file.read()    # remainder: page body
                    file.close()
                    # jieba word segmentation of the body
                    seg_contents = jieba.lcut_for_search(contents)
                    contents = ' '.join(seg_contents)
                    url = page
                    seg_url = jieba.lcut_for_search(page)
                    # keep only the site keywords: drop scheme/punctuation
                    # tokens from the segmented URL
                    page = ' '.join(list(set(seg_url)-set(['.','http','https','/',':','?','=','html','shtml','www'])))
                    doc = Document()
                    doc.add(StringField("name", filename, Field.Store.YES))
                    doc.add(StringField("path", path, Field.Store.YES))
                    if len(contents) > 0:
                        doc.add(TextField('title', title, Field.Store.YES))
                        doc.add(TextField('site', page, Field.Store.YES))
                        doc.add(TextField('url',url,Field.Store.YES))
                        doc.add(TextField('contents', contents, Field.Store.YES))
                    else:
                        # Document is still added, just without text fields.
                        print("warning: no content in %s" % filename)
                    writer.addDocument(doc)
def addDoc(w, name, birth_date, death_date, birth_note, death_note):
    """Add one person record to the index via writer `w`.

    The name is indexed as analyzed text; the date and note fields are
    stored as exact (untokenized) terms.
    """
    record = Document()
    record.add(TextField("name", name, Field.Store.YES))
    for field_name, value in (("birth_date", birth_date),
                              ("death_date", death_date),
                              ("birth_note", birth_note),
                              ("death_note", death_note)):
        record.add(StringField(field_name, value, Field.Store.YES))
    w.addDocument(record)
def createDocument(item_id, label, viewSimilar, viewProspective):
    """Build a Document for one item.

    `itemID` and `label` are indexed as exact terms; every element of
    `viewSimilar` / `viewProspective` is attached as a stored-only field
    (retrievable but not searchable).
    """
    document = Document()
    document.add(StringField('itemID', item_id, Field.Store.YES))
    document.add(StringField('label', label, Field.Store.YES))
    for field_name, values in (("viewSimilar", viewSimilar),
                               ("viewProspective", viewProspective)):
        for entry in values:
            document.add(StoredField(field_name, entry))
    return document
def create_document_by_document_sentence(org_title, preprocessed_title, doc_id, sentence):
    """Build a Document representing one sentence of a source document.

    Titles and the doc id identify the parent document; the sentence
    itself is the analyzed, searchable text.
    """
    sentence_doc = Document()
    # Original title kept as an exact term; preprocessed title analyzed.
    sentence_doc.add(StringField("org_title", org_title, Field.Store.YES))
    sentence_doc.add(TextField("preprocessed_title", preprocessed_title,
                               Field.Store.YES))
    sentence_doc.add(StringField("doc_id", str(doc_id), Field.Store.YES))
    sentence_doc.add(TextField("sentence", sentence, Field.Store.YES))
    return sentence_doc
def create_doc(item_id, label, viewSimilar, viewProspective, model="default"):
    """Build an (id, Document) pair for one recommendation item.

    The id is the md5 hex digest of "<label>_<item_id>"; `ttl` records
    the creation time (epoch seconds).  `id` and `ttl` are indexed but
    not stored; the view payloads are stored-only fields.
    """
    now_time = int(time.time())
    digest = hashlib.md5(f"{label}_{item_id}".encode('utf-8')).hexdigest()
    doc = Document()
    doc.add(StringField("id", digest, Field.Store.NO))
    doc.add(StringField("itemID", item_id, Field.Store.YES))
    doc.add(StringField("label", label, Field.Store.YES))
    doc.add(StoredField("viewSimilar", viewSimilar))
    doc.add(StoredField("viewProspective", viewProspective))
    doc.add(StringField("model", model, Field.Store.YES))
    doc.add(StringField("ttl", str(now_time), Field.Store.NO))
    return digest, doc
def obj_to_document(obj):
    """Serialize `obj` into a Lucene Document.

    Each attribute of `obj.data` becomes one or more fields, and a
    companion "<name><LTPF_TYPE>" field records a type tag (one of the
    LT_* constants defined elsewhere) so the value's original type can
    be recovered when reading the document back.

    NOTE(review): this relies on the Python 2 builtins `unicode` and
    `long`, plus `jieba`, `hyper_text`, and the LT_*/LTPF_* constants
    from elsewhere in the project — confirm the runtime provides them.
    """
    def conv_to_str(x):
        # Encode unicode to UTF-8 bytes; everything else via str().
        if isinstance(x, unicode):
            return x.encode('utf8')
        return str(x)

    res = Document()
    res.add(StringField('index', conv_to_str(obj.index), Field.Store.YES))
    res.add(StringField('type', obj.__class__.__name__, Field.Store.YES))
    for k, v in vars(obj.data).items():
        if v is None:
            # Empty, stored-but-unindexed placeholder.
            res.add(Field(k, '', Field.Store.YES, Field.Index.NO))
            fieldtype = LT_NONE
        elif isinstance(v, list):
            if len(v) > 0 and isinstance(v[0], int):
                # Int list: index the deduplicated values as one text blob.
                res.add(
                    TextField(k, ' '.join((str(x) for x in set(v))),
                              Field.Store.YES))
                fieldtype = LT_INTLIST
            else:
                # Other lists: deduplicate and index as text.
                res.add(TextField(k, ' '.join(list(set(v))), Field.Store.YES))
                fieldtype = LT_LIST
        elif isinstance(v, str) or isinstance(v, unicode):
            # Store the raw string unindexed, and index a jieba-segmented
            # copy under "<k><LTPF_FOR_QUERY>" for querying only.
            res.add(Field(k, v, Field.Store.YES, Field.Index.NO))
            res.add(
                TextField(k + LTPF_FOR_QUERY,
                          ' '.join(jieba.lcut_for_search(v)),
                          Field.Store.NO))
            fieldtype = LT_STRING
        elif isinstance(v, hyper_text):
            # Hypertext: store the raw markup; index only the plain text.
            res.add(Field(k, v.raw, Field.Store.YES, Field.Index.NO))
            res.add(
                TextField(k + LTPF_FOR_QUERY,
                          ' '.join(jieba.lcut_for_search(v.text)),
                          Field.Store.NO))
            fieldtype = LT_HYPERTEXT
        elif isinstance(v, bool):
            # bool is checked before int (bool subclasses int).
            if v:
                vs = '1'
            else:
                vs = '0'
            res.add(StringField(k, vs, Field.Store.YES))
            fieldtype = LT_BOOL
        elif isinstance(v, int) or isinstance(v, long):
            res.add(StringField(k, str(v), Field.Store.YES))
            fieldtype = LT_INT
        else:
            raise Exception('unrecognized data type')
        # Type tag for this field, stored but not indexed.
        res.add(
            Field(k + LTPF_TYPE, fieldtype, Field.Store.YES, Field.Index.NO))
    return res
def IndexDocs(self, documents):
    """
    Index documents under the directory

    :Parameters:
    - `documents`: Documents to be indexed (List of dicts with keys
      'name', 'content', 'date', 'url', 'tags')
    """
    # Build the writer from the configured analyzer and index directory.
    config = IndexWriterConfig(self.__analyzer)
    writer = IndexWriter(self.__indexDir, config)

    for entry in documents:
        # One Lucene document per input record.
        doc = Document()
        doc.add(TextField(Indexer.NAME, entry['name'], Field.Store.YES))
        doc.add(Field(Indexer.CONTENT, entry['content'], self.__contentType))
        doc.add(StringField(Indexer.DATE, entry['date'], Field.Store.YES))
        doc.add(StringField(Indexer.URL, entry['url'], Field.Store.YES))
        doc.add(TextField(Indexer.TAGS, self.__qualifyTags(entry['tags']),
                          Field.Store.YES))
        doc.add(LongPoint(Indexer.TIMESTAMP,
                          self.__getTimestamp(entry['date'])))

        if self.__boAppend:
            # Existing index: replace any old copy matching the same name.
            if self.__verbose:
                print("Updating " + entry['name'])
            writer.updateDocument(Term(Indexer.NAME, entry['name']), doc)
        else:
            # Fresh index: plain add, no prior document can exist.
            if self.__verbose:
                print("Adding " + entry['name'])
            writer.addDocument(doc)

    # Report and release the writer.
    print("Indexed %d documents (%d docs in index)" % (len(documents),
                                                       writer.numDocs()))
    writer.close()
def addDoc(w, data):
    """Build a Document from `data` and add it to index writer `w`.

    Args:
        w: an open IndexWriter.
        data: mapping of field name -> (value, field_type), where
            field_type is one of 'StringField', 'TextField',
            'CUSTOM_FIELD_TEXT', 'CUSTOM_FIELD_TEXT_NOT_STORED',
            'INTEGER_STORED'.  Unknown types are reported and skipped.

    Failures in `addDocument` are reported (best-effort, as before) by
    dumping every field/value pair; the exception is not re-raised.
    """
    doc = Document()
    for field in data:
        # `field_type` was named `type` before, shadowing the builtin.
        value, field_type = data[field][0], data[field][1]
        if field_type == 'StringField':
            doc.add(StringField(field, value, Field.Store.YES))
        elif field_type == 'TextField':
            doc.add(TextField(field, value, Field.Store.YES))
        elif field_type == 'CUSTOM_FIELD_TEXT':
            doc.add(Field(field, value, CUSTOM_FIELD_TEXT))
        elif field_type == 'CUSTOM_FIELD_TEXT_NOT_STORED':
            doc.add(Field(field, value, CUSTOM_FIELD_TEXT_NOT_STORED))
        elif field_type == 'INTEGER_STORED':
            doc.add(StoredField(field, value))
        else:
            print('UNKNOWN FIELD')
    try:
        w.addDocument(doc)
    except Exception:
        # Was a bare `except:` — that also swallowed SystemExit and
        # KeyboardInterrupt.  Keep the best-effort diagnostics.
        print('-----------------------------------')
        for field in data:
            value, field_type = data[field][0], data[field][1]
            print('field=%s\nvalue=%s' % (field, str(value)))
def createDoc(self, url, html, duplicate):
    """Parse `html` and build a Document for `url`.

    Args:
        url: source URL of the page.
        html: raw HTML to parse via self.parseHtml.
        duplicate: truthy flag; stored as the string "true"/"false".

    Returns:
        A Document with title/url/duplicate as exact terms and, when
        non-empty, the page contents as analyzed text.
    """
    title, contents = self.parseHtml(url, html)
    doc = Document()
    doc.add(StringField("title", title, Field.Store.YES))
    doc.add(StringField("url", url, Field.Store.YES))
    doc.add(
        StringField("duplicate", str(duplicate).lower(), Field.Store.YES))
    if len(contents) > 0:
        doc.add(TextField("contents", contents, Field.Store.YES))
    else:
        # Bug fix: was a Python 2 `print` statement, a SyntaxError under
        # Python 3 (which the rest of this file targets).
        print("Warning: No content in %s" % url)
    return doc
def add_doc(self, item_data):
    """Insert or replace one recommendation record in the index.

    `item_data` must contain 'item_id' and 'ttl'; 'version',
    'view_similar' and 'view_prospective' are optional.  The record id
    is the md5 of "<item_id>_<version>", and updateDocument keyed on
    that id makes the operation an upsert.
    """
    item_id = item_data['item_id']
    ttl = item_data['ttl']
    version = item_data.get('version', 'default')
    # Payloads are stored as JSON strings (stored-only, not searchable).
    similar_json = json.dumps(item_data.get('view_similar', {}))
    prospective_json = json.dumps(item_data.get('view_prospective', {}))

    doc = Document()
    digest = hashlib.md5(f"{item_id}_{version}".encode('utf-8')).hexdigest()
    doc.add(StringField("id", digest, Field.Store.NO))
    doc.add(LongPoint("ttl", ttl))
    doc.add(StringField("version", version, Field.Store.YES))
    doc.add(StringField("item_id", item_id, Field.Store.YES))
    doc.add(StoredField("view_similar", similar_json))
    doc.add(StoredField("view_prospective", prospective_json))
    self.writer.updateDocument(Term("id", digest), doc)
def add_code_keyword_into_document(document, file_content, node, counter):
    """Add the code identifiers collected in `node` as "word" fields.

    Each non-empty entry from the node's identifier lists is indexed
    under the shared "word" field and tallied on `counter`.  The flag
    is raised only by the "strong" characteristics: typed method calls,
    class instantiations, called methods, and literals.

    Returns:
        True if at least one strong characteristic was stored.
    """
    flag = False

    for ident in node["typed_method_call"]:
        if ident:
            document.add(
                Field("word", ident, Field.Store.YES, Field.Index.ANALYZED))
            counter.typed_method_call_count += 1
            flag = True

    for ident in node["extends"]:
        if ident:
            # Indexed but not stored.
            document.add(
                Field("word", ident, Field.Store.NO, Field.Index.ANALYZED))
            counter.extends_count += 1

    for ident in node["used_classes"]:
        if ident:
            document.add(
                Field("word", str(ident), Field.Store.YES,
                      Field.Index.ANALYZED))
            counter.used_classes_count += 1

    for ident in node["class_instance_creation"]:
        if ident:
            document.add(
                Field("word", ident, Field.Store.YES, Field.Index.ANALYZED))
            counter.class_instance_creation_count += 1
            flag = True

    for ident in node["methods"]:
        if ident:
            document.add(
                Field("word", ident, Field.Store.YES, Field.Index.ANALYZED))
            counter.methods_count += 1

    for ident in node["methods_called"]:
        if ident:
            document.add(
                Field("word", ident, Field.Store.YES, Field.Index.ANALYZED))
            counter.methods_called_count += 1
            flag = True

    for ident in node["unresolved_method_calls"]:
        if ident:
            document.add(
                Field("word", ident, Field.Store.YES, Field.Index.ANALYZED))
            counter.unresolved_method_calls_count += 1

    for ident in node["literals"]:
        if ident:
            # Literals are exact terms, not analyzed.
            document.add(StringField("word", ident, Field.Store.YES))
            counter.literals_count += 1
            flag = True

    return flag
def addDocument(self, id):
    """Index the training answer stored at `answers_train[id]`.

    The answer text goes in the analyzed "pa" field, the id in an exact
    "id" field; the write is committed immediately.
    """
    global answers_train
    answer_text = answers_train[id]
    doc = Document()
    doc.add(TextField("pa", answer_text, Field.Store.YES))
    doc.add(StringField("id", str(id), Field.Store.YES))
    self.w.addDocument(doc)
    self.w.commit()
def create_document(file_name):
    """Build a Document for the text file at ./alldocs/<file_name>.

    The file name becomes the exact-match "title" field and the file's
    full text the analyzed "text" field.

    Bug fix: the title field referenced the undefined name `input_file`
    (NameError at runtime); it now uses `file_name`.  The file is also
    opened via a context manager so it is closed even if reading fails.
    """
    path = './alldocs/' + file_name
    doc = Document()
    doc.add(StringField("title", file_name, Field.Store.YES))
    with open(path) as handle:
        doc.add(TextField("text", handle.read(), Field.Store.YES))
    return doc
def get_doc(self, doc_info, contents):
    """Build a `Document` from webpage metadata and contents.

    Args:
        doc_info: dict with keys `name`, `path`, `title`, `url`, `site`.
        contents: extracted text of the webpage.

    Returns:
        A `Document` with the fields initialized.
    """
    doc = Document()
    # Exact-match metadata fields.
    for key in ("name", "path", "title", "url"):
        doc.add(StringField(key, doc_info[key], Field.Store.YES))
    # Analyzed fields.
    doc.add(TextField("site", doc_info['site'], Field.Store.YES))
    if len(contents) > 0:
        doc.add(TextField("contents", contents, Field.Store.YES))
    else:
        print("Warning: No content in {}".format(doc_info['name']))
    return doc