def documents2ArffJsonInstancesCorpus(filepaths, tokens2IndexMap):
    """Parse each document and write an ArffJson corpus file.

    Writes a one-line JSON header followed by one ArffJson line per
    document whose sources include "zbmath metadata".

    :param filepaths: iterable of paths to raw document files
    :param tokens2IndexMap: dict mapping token -> attribute index;
        only its size is used here (num-attributes in the header)
    """
    p = DocumentParser()
    # ``with`` flushes and closes the file even if parsing raises.
    with open("raw_data/fulltext-corpus.json", "w") as f:
        # BUGFIX: the original header omitted the opening quote of both
        # keys, emitting invalid JSON ({relation-name":...}).
        f.write(
            "{\"relation-name\":\"full-text-corpus\","
            "\"num-attributes\":" + str(len(tokens2IndexMap)) + "}\n"
        )
        for filepath in filepaths:
            doc = p.parse(filepath)
            # Only documents backed by zbMATH metadata enter the corpus.
            if "zbmath metadata" in doc.includedSources:
                f.write(doc.toArffJsonDocument(tokens2IndexMap) + "\n")
def documents2ArffJsonInstancesCorpus(filepaths, tokens2IndexMap):
    """Parse each document and write an ArffJson corpus file.

    NOTE(review): this is a byte-for-byte duplicate of the definition
    above; one of the two should be deleted. Kept and fixed here so the
    surviving definition is the corrected one either way.

    :param filepaths: iterable of paths to raw document files
    :param tokens2IndexMap: dict mapping token -> attribute index;
        only its size is used here (num-attributes in the header)
    """
    p = DocumentParser()
    # ``with`` flushes and closes the file even if parsing raises.
    with open("raw_data/fulltext-corpus.json", "w") as f:
        # BUGFIX: the original header omitted the opening quote of both
        # keys, emitting invalid JSON ({relation-name":...}).
        f.write(
            "{\"relation-name\":\"full-text-corpus\","
            "\"num-attributes\":" + str(len(tokens2IndexMap)) + "}\n"
        )
        for filepath in filepaths:
            doc = p.parse(filepath)
            # Only documents backed by zbMATH metadata enter the corpus.
            if "zbmath metadata" in doc.includedSources:
                f.write(doc.toArffJsonDocument(tokens2IndexMap) + "\n")
def dumpDocumentDataMaps(tokens2IndexMap, filenameFilepathsPairs, targetDir):
    """Parse documents and dump each one's data map as a JSON file.

    For every (filename, filepath) pair, parses the document and — if its
    sources include "zbmath metadata" — writes ``targetDir/<filename>.json``
    containing the document's data map.

    :param tokens2IndexMap: dict mapping token -> attribute index,
        forwarded to ``doc.toDataMap``
    :param filenameFilepathsPairs: iterable of (output-name, input-path)
    :param targetDir: directory receiving the per-document JSON files
    """
    p = DocumentParser()
    totalDocs = len(filenameFilepathsPairs)
    # enumerate replaces the hand-rolled counter from the original.
    for count, (filename, filepath) in enumerate(filenameFilepathsPairs):
        doc = p.parse(filepath)
        # Single-argument print() is valid on both Python 2 and 3.
        print(str(count) + " / " + str(totalDocs))
        if "zbmath metadata" in doc.includedSources:
            dataMap = doc.toDataMap(tokens2IndexMap)
            # ``with`` closes the output file even if dumps/write raises.
            with open(path.join(targetDir, filename + ".json"), "w") as f:
                f.write(json.dumps(dataMap))
def buildWordCountDict(filepaths):
    """Count token occurrences across documents with zbMATH metadata.

    Parses every file, and for each document whose sources include
    "zbmath metadata" adds one count per token occurrence.

    :param filepaths: iterable of document file paths
    :return: dict mapping token -> total occurrence count
    """
    p = DocumentParser()
    wordCounts = {}
    total = len(filepaths)
    # enumerate replaces the hand-rolled counter from the original.
    for count, filepath in enumerate(filepaths):
        # Progress line; single-argument print() works on Python 2 and 3.
        print(str(count) + "/" + str(total))
        doc = p.parse(filepath)
        if "zbmath metadata" in doc.includedSources:
            for token in doc.tokens:
                # get() collapses the membership-check-then-init idiom.
                wordCounts[token] = wordCounts.get(token, 0) + 1
    return wordCounts
def dumpDocumentDataMaps(tokens2IndexMap, filenameFilepathsPairs, targetDir):
    """Parse documents and dump each one's data map as a JSON file.

    NOTE(review): this is a byte-for-byte duplicate of the earlier
    definition with the same name; one of the two should be deleted.
    Fixed identically here so the surviving definition is correct.

    :param tokens2IndexMap: dict mapping token -> attribute index,
        forwarded to ``doc.toDataMap``
    :param filenameFilepathsPairs: iterable of (output-name, input-path)
    :param targetDir: directory receiving the per-document JSON files
    """
    p = DocumentParser()
    totalDocs = len(filenameFilepathsPairs)
    # enumerate replaces the hand-rolled counter from the original.
    for count, (filename, filepath) in enumerate(filenameFilepathsPairs):
        doc = p.parse(filepath)
        # Single-argument print() is valid on both Python 2 and 3.
        print(str(count) + " / " + str(totalDocs))
        if "zbmath metadata" in doc.includedSources:
            dataMap = doc.toDataMap(tokens2IndexMap)
            # ``with`` closes the output file even if dumps/write raises.
            with open(path.join(targetDir, filename + ".json"), "w") as f:
                f.write(json.dumps(dataMap))
def buildWordCountDict(filepaths):
    """Count token occurrences across documents with zbMATH metadata.

    NOTE(review): this is a byte-for-byte duplicate of the earlier
    definition with the same name; one of the two should be deleted.
    Fixed identically here so the surviving definition is correct.

    :param filepaths: iterable of document file paths
    :return: dict mapping token -> total occurrence count
    """
    p = DocumentParser()
    wordCounts = {}
    total = len(filepaths)
    # enumerate replaces the hand-rolled counter from the original.
    for count, filepath in enumerate(filepaths):
        # Progress line; single-argument print() works on Python 2 and 3.
        print(str(count) + "/" + str(total))
        doc = p.parse(filepath)
        if "zbmath metadata" in doc.includedSources:
            for token in doc.tokens:
                # get() collapses the membership-check-then-init idiom.
                wordCounts[token] = wordCounts.get(token, 0) + 1
    return wordCounts