Exemplo n.º 1
0
def GetDocuments(uID=None, archived=False, start=0, end=50):
    """Return the documents visible to ``uID``, ordered by ``-dScanned``.

    A document matches when ``uID`` equals its ``owner`` or the ``uID`` of
    one of its ``policies`` (both compared with ``iexact``).

    Args:
        uID: User identifier to filter on. When falsy, an empty list is
            returned without querying.
        archived: None - return all documents; True - only archived;
            False - only not archived.
        start: First index of the slice applied to the ordered result.
        end: End index of the slice. ``0`` means "no limit": the whole
            ordered query set is returned and ``start`` is ignored.

    Returns:
        The ordered ``Document`` query set (sliced unless ``end == 0``),
        or ``[]`` when no ``uID`` is given.
    """
    if not uID:
        return []

    # Visibility filter: owned by uID, or uID appears in the policies.
    query = Q(owner__iexact=uID) | Q(policies__uID__iexact=uID)

    # archived=None disables the archived filter altogether.
    if archived is not None:
        query = query & Q(archived=archived)

    docs = Document.objects(query).order_by('-dScanned')

    # end == 0 is the "unlimited" sentinel, so skip the slice entirely.
    if end == 0:
        return docs
    return docs[start:end]
Exemplo n.º 2
0
def NewDocument(name,
                subject,
                fileName,
                owner,
                comments='',
                desc='',
                status='Recorded',
                docID=None):
    """Create and save a new ``Document``.

    Args:
        name, subject, fileName, owner, comments, desc, status: Stored
            as-is on the new document.
        docID: Optional identifier to use. When omitted (or falsy) one is
            generated from the current Unix time plus a short random
            suffix.

    Returns:
        A ``(code, docID)`` tuple. ``code`` is ``0`` on success and
        ``-300`` when a document with that ``docID`` already exists or the
        save raises ``NotUniqueError``.
    """
    if docID:
        # Honour the caller-supplied ID, but refuse to reuse a taken one.
        if GetDocByDocID(docID):
            return -300, docID
    else:
        # No ID supplied: timestamp + 5 lower-cased random URL-safe chars.
        docID = str(int(time.time())) + secrets.token_urlsafe()[:5].lower()

    try:
        d = Document(name=name,
                     docID=docID,
                     subject=subject,
                     status=status,
                     dScanned=time.time(),
                     comments=comments,
                     desc=desc,
                     fileName=fileName,
                     owner=owner)
        d.save()
    except me.errors.NotUniqueError:
        # Presumably covers a racing insert of the same ID between the
        # existence check and the save; reported with the same code.
        return -300, docID

    return 0, docID
Exemplo n.º 3
0
def SearchDocsByName(uID, name, start=0, end=50):
    """Search the documents visible to ``uID`` whose name contains ``name``.

    Visibility means ``uID`` matches the document's ``owner`` or the
    ``uID`` of one of its ``policies``. Results are ordered by
    ``-dScanned``; ``end == 0`` returns them unsliced, otherwise the
    ``[start:end]`` slice is returned.
    """
    visible_to_user = Q(owner__iexact=uID) | Q(policies__uID__iexact=uID)
    name_matches = Q(name__icontains=name)

    ordered = Document.objects(name_matches
                               & visible_to_user).order_by('-dScanned')
    return ordered if end == 0 else ordered[start:end]
Exemplo n.º 4
0
def SearchDocsByHashTag(uID, hashTag, start=0, end=50):
    """Search the documents visible to ``uID`` by hash tag.

    Args:
        uID: User identifier; a document is visible when it matches the
            ``owner`` or the ``uID`` of one of the ``policies``.
        hashTag: Text matched against ``hashTags`` with ``icontains``.
        start: First index of the slice applied to the ordered result.
        end: End index of the slice. ``0`` means "no limit": the whole
            ordered query set is returned and ``start`` is ignored.

    Returns:
        The matching ``Document`` query set ordered by ``-dScanned``.
    """
    query = Q(hashTags__icontains=hashTag) & (
        Q(owner__iexact=uID) | Q(policies__uID__iexact=uID))

    # Order first, then slice, in the same way as SearchDocsByName.
    docs = Document.objects(query).order_by('-dScanned')
    if end == 0:
        return docs
    return docs[start:end]
Exemplo n.º 5
0
def SearchDocsBySubject(uID, subject, start=0, end=50):
    """Search the documents visible to ``uID`` by subject.

    Args:
        uID: User identifier; a document is visible when it matches the
            ``owner`` or the ``uID`` of one of the ``policies``.
        subject: Text matched against ``subject`` with ``icontains``.
        start: First index of the slice applied to the ordered result.
        end: End index of the slice. ``0`` means "no limit": the whole
            ordered query set is returned and ``start`` is ignored.

    Returns:
        The matching ``Document`` query set ordered by ``-dScanned``.
    """
    query = Q(subject__icontains=subject) & (
        Q(owner__iexact=uID) | Q(policies__uID__iexact=uID))

    # Order first, then slice, in the same way as SearchDocsByName.
    docs = Document.objects(query).order_by('-dScanned')
    if end == 0:
        return docs
    return docs[start:end]
def load_document(path):
    """Parse the HTML file at ``path`` into a ``Document``.

    The file is read as bytes and decoded as ``utf-8-sig`` (so a leading
    BOM is dropped). ``<script>`` and ``<style>`` elements are removed
    before the text is extracted.

    Returns:
        ``Document(path, title, words)`` where ``title`` is the text of the
        ``<title>`` element (``''`` when there is none) and ``words`` is
        ``utility.cut_words`` applied to the remaining document text.
    """
    # Use a context manager so the file handle is closed deterministically.
    with open(path, "rb") as html_file:
        html = html_file.read().decode('utf-8-sig')

    dom = bs4.BeautifulSoup(html, "html.parser")
    for extra_tag in dom.find_all(["script", "style"]):
        extra_tag.decompose()

    title_node = dom.find('title')
    title = title_node.text if title_node else ''
    return Document(path, title, utility.cut_words(dom.text))
Exemplo n.º 7
0
def load_dataset(filename: str, labeled: bool):
    """Load one document per line from ``filename`` into the global index.

    Every non-blank line whose text yields at least one word is appended to
    the global ``documents`` list, and each distinct word of it gets an
    occurrence record (document id plus word positions) appended to the
    matching entry of the global ``keywords`` mapping.

    Args:
        filename: Path of the text file to read.
        labeled: When true, the first character of each line is parsed as
            an integer label and the rest of the line is the text;
            otherwise the whole line is the text and the label is ``None``.

    Raises:
        ValueError: If ``labeled`` is true and a non-blank line does not
            start with a digit.
    """
    global keywords, documents

    # Number of the current line within the file. Only used to name the
    # document in search results; the indexing itself does not depend on it.
    line_no = 0
    with open(filename, "r") as dataset:
        for line in dataset:
            # Id of the current document within the whole database.
            doc_id = len(documents)
            line_no += 1

            # Report progress.
            if line_no % 100000 == 0:
                print("Loading %s data: %d lines loaded" %
                      ("labeled" if labeled else "non-labeled", line_no))

            # Lines keep their trailing newline, so a blank line is "\n",
            # not "". Skip it here, otherwise int(line[0]) below would fail.
            if not line.strip():
                continue

            if labeled:
                label = int(line[0])
                text = line[1:].strip()
            else:
                text = line.strip()
                label = None

            # Split into words (the original note says stop words are
            # removed by this step as well).
            words = utility.cut_words(text)
            if len(words) == 0:
                continue

            doc = Document(
                "%s #%d" % ("Labeled" if labeled else "Non-lebeled", line_no),
                text, words, label)

            # Collect the positions of every distinct word in one pass
            # instead of rescanning the whole word list once per word.
            positions = {}
            for index, word in enumerate(words):
                positions.setdefault(word, []).append(index)

            # Add all occurrences of each keyword in this document to the
            # inverted index.
            for word, indexes in positions.items():
                if word not in keywords:
                    keywords[word] = Keyword(word)
                keywords[word].occurs.append(
                    KeywordOccurrenceInDocument(doc_id, indexes))

            documents.append(doc)
Exemplo n.º 8
0
def create_document(title='', author='', doi='', tags=None, **kwargs):
    """Define a new document in the database. Returns new ID.

    Args:
        title, author, doi: Passed positionally to ``Document``.
        tags: Optional iterable of tag strings. Falsy entries are skipped
            and the others are stripped before being stored as ``u'tag'``
            ``Metadata`` rows. ``None`` (the default) means no tags.
        **kwargs: ``type`` (defaults to ``u'doc'``) and ``parent``
            (defaults to ``None``) are forwarded to ``Document``.

    Returns:
        The ``id`` of the newly created document.
    """
    # create new document
    new_doc = Document(kwargs.get(u'type', u'doc'), title, author,
                       doi, None, kwargs.get(u'parent'))
    db.session.add(new_doc)
    db.session.commit()  # retrieve new_doc.id

    # create new tags; ``tags or ()`` avoids a shared mutable default
    for tag in tags or ():
        if tag:
            new_tag = Metadata(new_doc.id, u'tag', tag.strip())
            db.session.add(new_tag)

    db.session.commit()
    return new_doc.id
Exemplo n.º 9
0
async def add():
    """Add an entry to a project.

    The arguments must be:
        project str: the name of the project where to add the new entry to
        name str: the name/key of the entry in the project.
                  This name will be used when exporting the project.
        location str: the document's URI
        description str: a short summary of the file. Typically used to get
                        a quick glance of the file. This is also used when
                        searching. Optional; defaults to an empty string.

    Returns:
        A redirect to the index page. When the form was complete, the
        redirect carries the project name as URL fragment.
    """
    form = await request.form

    try:
        project = form['project']
        name = form['name']
        loc = form['location']
    except KeyError:
        # A required field is missing: report it and go back to the index.
        await flash('Could not add content')
        return redirect(url_for("index"))

    description = form.get('description', '')
    ret = redirect(f'{url_for("index")}#{project}')

    async with Database(app.config['DATABASE']) as db:
        project_id = await db.fetch_val(
            query='SELECT ProjectId FROM Project WHERE Name = :project',
            values={'project': project})

        if project_id is None:
            await flash('This project does not exist')
            return ret

        # Insert the document first; its returned id links it to the project.
        doc_id = await db.execute(query=Document.insert(),
                                  values={'Name': name,
                                          'Location': loc,
                                          'Description': description})

        await db.execute(query=ProjectEntry.insert(),
                         values={'ProjectId': project_id,
                                 'DocumentId': doc_id})
    return ret
Exemplo n.º 10
0
def GetDocByDocID(docID):
    """Look up a document by ``docID`` (``iexact`` match).

    Returns whatever ``.first()`` yields for the query - presumably the
    first matching ``Document``, or ``None`` when nothing matches.
    """
    matches = Document.objects(docID__iexact=docID)
    return matches.first()
    for file_name in os.listdir(doc_dir):
        if not file_name.endswith('.txt'):
            continue
        (url, title, title_words,
         content_words) = process(F'{doc_dir}/{file_name}')
        if len(content_words) != 0:
            doc_tmp.append((url, title, title_words, content_words))
        doc_cnt += 1
        report(doc_cnt, total)

    stdout.write('Analysing Files:\n')
    stdout.flush()
    for doc_id in range(len(doc_tmp)):
        (url, title, title_words, content_words) = doc_tmp[doc_id]
        Documents.append(
            Document(url, title,
                     len(title_words) + len(content_words)))

        words = title_words + content_words

        for word in words:
            if word not in Keywords:
                Keywords[word] = Keyword(word)

            if len(Keywords[word].occurs
                   ) > 0 and Keywords[word].occurs[-1].doc_id == doc_id:
                continue

            Keywords[word].occurs.append(
                KeywordInDoc(
                    doc_id,
                    title_words.count(word) / len(title_words) * 500,