# Assumed imports for this module (module layout not shown in the snippet);
# Document is the MongoEngine model defined elsewhere in the project.
import time
import secrets
import mongoengine as me
from mongoengine import Q


def GetDocuments(uID=None, archived=False, start=0, end=50):
    """archived: None - return all documents; True - only archived; False - only non-archived."""
    if uID:
        # In the future, this should not only return documents that the uID
        # owns but also documents that the uID has access to.
        # end == 0 means "no limit".
        owner_or_shared = Q(owner__iexact=uID) | Q(policies__uID__iexact=uID)
        if archived is None:
            qs = Document.objects(owner_or_shared).order_by('-dScanned')
        else:
            qs = Document.objects(
                owner_or_shared & Q(archived=archived)).order_by('-dScanned')
        return qs if end == 0 else qs[start:end]
    return []

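# Usage sketch (not from the original module): illustrates the pagination
# contract above, where end == 0 disables the limit and archived=None returns
# documents in any archival state. The uID value is hypothetical.
first_page = GetDocuments(uID='alice', archived=False, start=0, end=20)
everything = GetDocuments(uID='alice', archived=None, end=0)
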
def NewDocument(name, subject, fileName, owner, comments='', desc='',
                status='Recorded', docID=None):
    if docID:
        # A caller-supplied docID that already exists is a collision.
        if GetDocByDocID(docID):
            return -300, docID
    else:
        # Generate a docID: unix timestamp plus a short random suffix.
        docID = str(int(time.time())) + secrets.token_urlsafe()[:5].lower()
    try:
        d = Document(name=name, docID=docID, subject=subject, status=status,
                     dScanned=time.time(), comments=comments, desc=desc,
                     fileName=fileName, owner=owner)
        d.save()
    except me.errors.NotUniqueError:
        return -300, docID
    return 0, docID

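# Caller-side sketch (assumption, not original code): NewDocument returns a
# (status, docID) pair, where 0 means success and -300 means the docID
# already exists. All field values here are hypothetical.
status, doc_id = NewDocument(name='Invoice 42', subject='billing',
                             fileName='invoice42.pdf', owner='alice')
if status == -300:
    raise RuntimeError(f'document ID collision: {doc_id}')
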
def SearchDocsByName(uID, name, start=0, end=50):
    qs = Document.objects(
        Q(name__icontains=name) &
        (Q(owner__iexact=uID) | Q(policies__uID__iexact=uID))).order_by('-dScanned')
    return qs if end == 0 else qs[start:end]

def SearchDocsByHashTag(uID, hashTag, start=0, end=50):
    qs = Document.objects(
        Q(hashTags__icontains=hashTag) &
        (Q(owner__iexact=uID) | Q(policies__uID__iexact=uID))).order_by('-dScanned')
    return qs if end == 0 else qs[start:end]

def SearchDocsBySubject(uID, subject, start=0, end=50):
    qs = Document.objects(
        Q(subject__icontains=subject) &
        (Q(owner__iexact=uID) | Q(policies__uID__iexact=uID))).order_by('-dScanned')
    return qs if end == 0 else qs[start:end]

def load_document(path):
    with open(path, "rb") as f:
        html = f.read().decode('utf-8-sig')
    dom = bs4.BeautifulSoup(html, "html.parser")
    # Drop non-content tags before extracting text.
    for extra_tag in dom.find_all(["script", "style"]):
        extra_tag.decompose()
    title_node = dom.find('title')
    title = title_node.text if title_node else ''
    return Document(path, title, utility.cut_words(dom.text))

def load_dataset(filename: str, labeled: bool):
    global keywords, documents
    # Line number of the current document within the file; used only for
    # displaying search results, not by the algorithm itself.
    line_no = 0
    for line in open(filename, "r"):
        if not line.strip():
            continue
        # The document's ID within the whole database.
        doc_id = len(documents)
        line_no += 1
        # Report progress.
        if line_no % 100000 == 0:
            print("Loading %s data: %d lines loaded" %
                  ("labeled" if labeled else "non-labeled", line_no))
        if labeled:
            label = int(line[0])
            text = line[1:].strip()
        else:
            text = line.strip()
            label = None
        # Tokenize and remove stop words.
        words = utility.cut_words(text)
        if len(words) == 0:
            continue
        doc = Document(
            "%s #%d" % ("Labeled" if labeled else "Non-labeled", line_no),
            text, words, label)
        for word in set(words):
            if word not in keywords:
                keywords[word] = Keyword(word)
            # Record every position where this keyword occurs in the current
            # document in the inverted index.
            positions = [i for i, w in enumerate(words) if w == word]
            keywords[word].occurs.append(
                KeywordOccurrenceInDocument(doc_id, positions))
        documents.append(doc)

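# Illustrative sketch (not part of the original code) of how the inverted
# index built by load_dataset() can be queried. It assumes the occurrence
# objects expose doc_id and positions attributes matching the constructor
# arguments above.
def find_documents(word):
    """Yield (document, positions) pairs for every document containing word."""
    if word in keywords:
        for occ in keywords[word].occurs:
            yield documents[occ.doc_id], occ.positions
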
def create_document(title='', author='', doi='', tags=None, **kwargs):
    """Define a new document in the database. Returns the new ID."""
    # Create the new document.
    new_doc = Document(kwargs.get(u'type', u'doc'), title, author, doi,
                       None, kwargs.get(u'parent'))
    db.session.add(new_doc)
    db.session.commit()  # commit so new_doc.id is populated
    # Create the new tags.
    for tag in tags or []:
        if tag:
            new_tag = Metadata(new_doc.id, u'tag', tag.strip())
            db.session.add(new_tag)
    db.session.commit()
    return new_doc.id

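# Usage sketch (values are hypothetical): creating a tagged document and a
# child entry attached to it via the type/parent keyword options read from
# **kwargs above.
paper_id = create_document(title='A Study of Indexing', author='Doe',
                           doi='10.1000/example', tags=['search', 'index'])
note_id = create_document(title='Reading notes', type=u'note', parent=paper_id)
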
async def add():
    """Add an entry to a project.

    The form must contain:
        project     str: the name of the project to add the new entry to
        name        str: the name/key of the entry in the project; this name
                    is used when exporting the project
        location    str: the document's URI
        description str: a short summary of the file, typically used to get a
                    quick glance of the file; also used when searching
    """
    form = await request.form
    try:
        project = form['project']
        name = form['name']
        loc = form['location']
        description = form.get('description', '')
    except KeyError:
        await flash('Could not add content')
        return redirect(url_for('index'))
    ret = redirect(f'{url_for("index")}#{project}')
    async with Database(app.config['DATABASE']) as db:
        project_id = await db.fetch_val(
            query='SELECT ProjectId FROM Project WHERE Name = :project',
            values={'project': project})
        if project_id is None:
            await flash('This project does not exist')
            return ret
        doc_id = await db.execute(query=Document.insert(),
                                  values={'Name': name,
                                          'Location': loc,
                                          'Description': description})
        await db.execute(query=ProjectEntry.insert(),
                         values={'ProjectId': project_id,
                                 'DocumentId': doc_id})
    return ret

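# Hypothetical test sketch (not from the original project): exercising the
# add() handler through Quart's test client. It assumes add() is registered
# as a POST route at /add on the Quart app object used above; the form
# values are made up.
async def test_add():
    client = app.test_client()
    response = await client.post('/add', form={
        'project': 'papers',
        'name': 'transformer-paper',
        'location': 'file:///archive/1706.03762.pdf',
        'description': 'Transformer architecture paper',
    })
    assert response.status_code == 302  # redirected back to the project view
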
def GetDocByDocID(docID):
    return Document.objects(docID__iexact=docID).first()

for file_name in os.listdir(doc_dir):
    if not file_name.endswith('.txt'):
        continue
    (url, title, title_words, content_words) = process(f'{doc_dir}/{file_name}')
    if len(content_words) != 0:
        doc_tmp.append((url, title, title_words, content_words))
    doc_cnt += 1
    report(doc_cnt, total)

stdout.write('Analysing Files:\n')
stdout.flush()
for doc_id in range(len(doc_tmp)):
    (url, title, title_words, content_words) = doc_tmp[doc_id]
    Documents.append(
        Document(url, title, len(title_words) + len(content_words)))
    words = title_words + content_words
    for word in words:
        if word not in Keywords:
            Keywords[word] = Keyword(word)
        # Skip if this document was already recorded for this keyword.
        if (len(Keywords[word].occurs) > 0
                and Keywords[word].occurs[-1].doc_id == doc_id):
            continue
        Keywords[word].occurs.append(
            KeywordInDoc(
                doc_id,
                title_words.count(word) / len(title_words) * 500,