Example #1
0
def cql_search(request):
    """Evaluate a CQL query (via FQL) over a FoLiA document built from
    the database and render the matching sentences.

    NOTE(review): assumes the request body is JSON carrying the CQL
    expression under the 'title' key — confirm against the caller.
    """
    from pynlpl.formats import fql, cql

    # Decode the incoming JSON payload.
    params = json.loads(request.body.decode('utf-8'))

    # Rebuild the FoLiA document from the current database contents.
    doc = folia.Document(id='doc')
    text = folia.Text(doc, id='doc.text')
    for sentence in Sentence.objects.all():
        folia_sentence = text.append(
            folia.Sentence(doc, id=doc.id + '.s.' + str(sentence.id)))
        for word in Word.objects.filter(Sentence_id=sentence.id):
            folia_sentence.append(
                folia.Word(
                    doc,
                    id=doc.id + '.s.' + str(sentence.id) + '.w.' + str(word.id),
                    text=word.value))
    doc.append(text)

    # Translate CQL into FQL and run it against the document.
    query = fql.Query(cql.cql2fql(params['title']))
    texts = query(doc)

    # Recover database sentence ids from the FoLiA parent ids
    # (format: '<doc id>.s.<sentence id>').
    matched_ids = [match[0].parent.id.split('s.')[1] for match in texts]
    sens = Sentence.objects.filter(id__in=matched_ids)

    # Render the results page.
    return render(request, 'cabinet/cql_results.html', {
        'texts': texts,
        'sens': sens
    })
Example #2
0
def makefoliadoc(outputfile):
    """Create an empty FoLiA document named after *outputfile*.

    The document id is the output file's basename with its
    '.folia.xml' or '.xml' suffix stripped.  A single Text body is
    appended, and every annotation layer the Alpino conversion emits
    is declared (skipping layers that are already declared).

    Returns the new folia.Document.
    """
    baseid = os.path.basename(outputfile).replace('.folia.xml',
                                                  '').replace('.xml', '')
    foliadoc = folia.Document(id=baseid)
    foliadoc.append(folia.Text(foliadoc, id=baseid + '.text'))

    # Table-driven replacement for seven copy-pasted declared/declare
    # pairs.  It mixes AnnotationType enum members and annotation
    # classes exactly as the original did — declare() accepts both.
    declarations = (
        (folia.AnnotationType.TOKEN, 'alpino-tokens'),
        (folia.LemmaAnnotation, 'alpino-lemmas'),
        (folia.SenseAnnotation, 'alpino-sense'),
        (folia.PosAnnotation, 'alpino-pos'),
        (folia.AnnotationType.DEPENDENCY, 'alpino-dependency'),
        (folia.AnnotationType.SYNTAX, 'alpino-syntax'),
        (folia.AnnotationType.MORPHOLOGICAL, 'alpino-morphology'),
    )
    for annotationtype, annotationset in declarations:
        if not foliadoc.declared(annotationtype, annotationset):
            foliadoc.declare(annotationtype, annotationset)

    return foliadoc
Example #3
0
def foliacat(id, outputfile, *files):
    """Concatenate multiple FoLiA documents into one.

    Creates a new document with the given *id*, merges the top-level
    elements of every input file into a single Text body (via concat),
    and saves to *outputfile* when anything was merged at all.

    Returns the merged folia.Document.
    """
    totalmerges = 0
    outputdoc = folia.Document(id=id)
    text = outputdoc.append(folia.Text(outputdoc, id=id + ".text"))
    for i, filename in enumerate(files):
        merges = 0
        print("Processing " + filename, file=sys.stderr)
        inputdoc = folia.Document(file=filename)
        print("(merging document)", file=sys.stderr)

        # Carry over any annotation declarations the output lacks.
        for annotationtype, set in inputdoc.annotations:
            if not outputdoc.declared(annotationtype, set):
                outputdoc.declare(annotationtype, set)

        # Merge every top-level element of this input document.
        for d in inputdoc.data:
            merges += concat(text, d)

        print("(merged " + str(merges) + " elements, with all elements contained therein)", file=sys.stderr)
        totalmerges += merges

    print("(TOTAL: merged " + str(totalmerges) + " elements, with all elements contained therein)", file=sys.stderr)
    # BUG FIX: the save guard previously tested the per-file counter
    # 'merges', which reflects only the last input file and is unbound
    # when *files is empty; use the running total instead.
    if outputfile and totalmerges > 0:
        outputdoc.save(outputfile)

    return outputdoc
Example #4
0
def convert_text_layer(nafparser, foliadoc):
    """Convert the NAF token layer into FoLiA structure.

    Appends a Text body to *foliadoc*, copies the NAF raw text onto
    it, and recreates paragraph/sentence/word structure from the NAF
    tokens, carrying character offsets over when they align with the
    raw layer.

    Returns the new folia.Text body.
    """
    textbody = foliadoc.append(folia.Text(foliadoc, id=foliadoc.id + '.text'))
    naf_raw = nafparser.get_raw()
    textbody.append(folia.TextContent, naf_raw)

    prevsent_id = None
    prevpara_id = None
    paragraph = None
    prevword = None
    prev_naf_token = None
    for naf_token in nafparser.get_tokens():
        para_id = naf_token.get_para()
        sent_id = naf_token.get_sent()
        if para_id != prevpara_id:
            if prevpara_id is None:
                #first paragraph, declare for completion's sake
                foliadoc.declare(folia.Paragraph, 'undefined')
            paragraph = textbody.append(folia.Paragraph,
                                        id=foliadoc.id + '.para' + para_id)
        if sent_id != prevsent_id:
            # New sentence: attach to the current paragraph when one
            # exists, otherwise directly to the text body.
            if paragraph:
                sentence = paragraph.append(folia.Sentence,
                                            id=foliadoc.id + '.sent' + sent_id)
            else:
                sentence = textbody.append(folia.Sentence,
                                           id=foliadoc.id + '.sent' + sent_id)

        token_id = naf_token.get_id()
        # Tokens directly adjacent in the raw text have no whitespace
        # between them: clear the previous word's space flag.
        if prev_naf_token is not None and int(prev_naf_token.get_offset(
        )) + int(prev_naf_token.get_length()) == int(naf_token.get_offset()):
            prevword.space = False
        word = sentence.append(folia.Word, id=foliadoc.id + '.' + token_id)
        offset = int(naf_token.get_offset())
        length = int(naf_token.get_length())
        # BUG FIX: validate the offset by comparing the raw-text slice
        # the token claims to cover against the token text.  The
        # original compared only the single character *after* the
        # token (naf_raw[offset + length]), which can never match a
        # multi-character token and so discarded virtually all offset
        # information.  A slice also never raises IndexError, so the
        # old try/except is no longer needed (an out-of-range offset
        # yields a short slice that simply fails the comparison).
        offset_valid = naf_raw[offset:offset + length] == naf_token.get_text()
        if not offset_valid:
            print(
                "WARNING: NAF error: offset for token " + token_id +
                " does not align properly with raw layer! Discarding offset information for FoLiA conversion",
                file=sys.stderr)
            word.append(folia.TextContent, naf_token.get_text())
        else:
            word.append(folia.TextContent,
                        naf_token.get_text(),
                        offset=naf_token.get_offset(),
                        ref=textbody)

        prevword = word
        prev_naf_token = naf_token

        prevpara_id = para_id
        prevsent_id = sent_id
    return textbody
Example #5
0
                    elif word:
                        # Wrap the token text in a FoLiA Word, attaching
                        # lemma and PoS annotations when available.
                        w = folia.Word(foliadoc, text=word, generate_id_in=s)
                        if lemma:
                            w.append( folia.LemmaAnnotation(foliadoc, cls=lemma) )
                        if pos:
                            w.append( folia.PosAnnotation(foliadoc, cls=pos) )
                        s.append(w)


else:
    # No existing FoLiA document: build a fresh one, declaring the
    # annotation layers Frog produces (tokens, CGN PoS tags, lemmas),
    # all marked as automatically generated by the 'Frog' annotator.
    foliadoc = folia.Document(id=foliaid)
    foliadoc.declare(folia.AnnotationType.TOKEN, set='http://ilk.uvt.nl/folia/sets/ucto-nl.foliaset', annotator='Frog',annotatortype=folia.AnnotatorType.AUTO)
    foliadoc.declare(folia.AnnotationType.POS, set='http://ilk.uvt.nl/folia/sets/cgn-legacy.foliaset', annotator='Frog',annotatortype=folia.AnnotatorType.AUTO)
    foliadoc.declare(folia.AnnotationType.LEMMA, set='http://ilk.uvt.nl/folia/sets/mblem-nl.foliaset', annotator='Frog',annotatortype=folia.AnnotatorType.AUTO)
    foliadoc.language('nld')
    text = folia.Text(foliadoc, id=foliadoc.id + '.text.1')
    foliadoc.append(text)


    curid = None
    # Walk the input fragments in lockstep with their (optional)
    # pre-assigned ids, creating one sentence or paragraph per fragment
    # depending on the processing mode ('s'/'n' = sentences, 'p' =
    # paragraphs); ids are generated within the text body when absent.
    # NOTE(review): 'data', 'idmap' and 'mode' are defined outside this
    # visible fragment — presumably the parsed input and CLI options.
    for (fragment, id) in zip(data,idmap):
        if mode == 's' or mode == 'n':
            if id:
                s = folia.Sentence(foliadoc, id=id)
            else:
                s = folia.Sentence(foliadoc, generate_id_in=text)
        elif mode == 'p':
            if id:
                p = folia.Paragraph(foliadoc, id=id)
            else:
                p = folia.Paragraph(foliadoc, generate_id_in=text)
Example #6
0
    try:
        # Best-effort directory creation; NOTE(review): the bare except
        # also hides real errors (permissions, bad path) — ideally this
        # would catch OSError/FileExistsError only.
        os.mkdir(os.path.join(outdir, collection_id))
    except:
        pass
    # Gather the gzipped source files for both the Dutch ('nl') and
    # Flemish ('vl') subdirectories of the component.
    files = list(glob.glob(os.path.join(compdir,"nl","*.gz"))) + list(glob.glob(os.path.join(compdir, "vl","*.gz")))
    for path in files:
        text_id = os.path.basename(path).split(".")[0]
        print("\t" + text_id)
        full_id = collection_id + "_" + text_id
        au_id = None
        sentence = None

        # One FoLiA document per text, referencing IMDI metadata and
        # declaring the CGN PoS/lemma annotation sets by handle.
        doc = folia.Document(id=full_id)
        doc.metadatatype = folia.MetaDataType.IMDI
        doc.metadatafile = text_id + ".imdi"
        textbody = doc.append(folia.Text(doc, id=full_id+"."+text_id))
        doc.declare(folia.PosAnnotation, set="hdl:1839/00-SCHM-0000-0000-000B-9")
        doc.declare(folia.LemmaAnnotation, set="hdl:1839/00-SCHM-0000-0000-000E-3")

        fin = gzip.open(path,'r')
        for line in fin:
            # NOTE(review): 'unicode' means this fragment is Python 2;
            # each raw line is decoded from the corpus encoding.
            line = unicode(line,CGN_ENCODING)
            if line:
                if line[0:3] == '<au':
                    # An '<au id="...">' tag opens a new utterance:
                    # extract the quoted id (search starts at column 8)
                    # and open a matching FoLiA sentence for it.
                    end = line[8:].find('"')
                    if end != -1:
                        end += 8
                        au_id = line[8:end]
                        sentence = textbody.append(folia.Sentence, id=full_id + ".s." + au_id)
                elif line[0:3] == '<mu':
                    au_id = None