Python tokenizeText示例

编程语言: Python

命名空间/包名称: nlp_functions

方法/功能: tokenizeText

hotexamples.com的示例: 5

Python tokenizeText - 共找到5个示例。以下是从开源项目中提取的、评价最高的nlp_functions.tokenizeText真实Python代码示例。您可以对这些示例进行评分，以帮助我们提高示例质量。

示例#1

显示文件

文件： doc_representation.py 项目： danieldmm/minerva

def processContext(match,doctext, wleft, wright):
    """
        Extract pre-tokenized context on both sides of a match in doctext.

        Only a bounded slice of characters around the match is processed
        instead of the whole document: the requested word counts are padded
        by 15 words and converted to characters with
        ESTIMATED_AVERAGE_WORD_LENGTH.

        Args:
            match: object exposing start() and end() offsets into doctext
                (presumably a regex match object -- confirm against callers)
            doctext: text that the match offsets refer to
            wleft: number of words of context wanted before the match
            wright: number of words of context wanted after the match
        Returns:
            dict with keys "left" and "right" (the processed context on each
            side) and "left_start" / "right_end" (the character offsets that
            bound the examined window)
    """
    match_begin = match.start()
    match_finish = match.end()

    # Character budget on each side: requested words plus 15 words of slack.
    chars_before = (wleft + 15) * ESTIMATED_AVERAGE_WORD_LENGTH
    chars_after = (wright + 15) * ESTIMATED_AVERAGE_WORD_LENGTH

    # The left bound is clamped at the start of the text; the right bound is
    # not clamped, slicing simply stops at the end of doctext.
    left_start = max(0, match_begin - chars_before)
    right_end = match_finish + chars_after

    before_tokens = tokenizeText(removeCitations(doctext[left_start:match_begin]))
    after_tokens = tokenizeText(removeCitations(doctext[match_finish:right_end]))

    return {
        "left": removePunctuation(before_tokens),
        "right": removePunctuation(after_tokens),
        "left_start": left_start,
        "right_end": right_end,
    }

示例#2

显示文件

文件： doc_representation.py 项目： danieldmm/minerva

def getDocBOWpassagesMulti(doc, parameters=None, doctext=None):
    """
        Get BOWs for a document by cutting its text into fixed-size token passages.

        The text has citations removed, is lower-cased and tokenized. For every
        window size two series of passages are produced: one starting at
        token 0 and one starting half a window later, so each passage of the
        second series straddles two neighbouring passages of the first.

        Args:
            doc: full SciDoc to get text for
            parameters: iterable of window sizes, in tokens; defaults to [100]
            doctext: text to use instead of the document's own; when empty or
                None it is fetched with doc.getFullDocumentText(doc, headers=True)
        Returns:
             multiple BOWs in a dictionary where the keys are the parameters;
             each value is a list of {"text": ...} dicts, the aligned passages
             first, followed by the half-window-offset passages
    """
    # None sentinel instead of a shared mutable default list.
    if parameters is None:
        parameters = [100]

    if not doctext:
        doctext=doc.getFullDocumentText(doc, headers=True)

    doctext=removeCitations(doctext).lower()
    tokens=tokenizeText(doctext)
    num_tokens=len(tokens)
    res={}

    for param in parameters:
        passages=[]

        # `//` keeps the offset an integer on Python 3 (and under
        # `from __future__ import division`); `range` exists on both
        # Python 2 and 3, unlike `xrange`.
        for offset in (0, param // 2):
            for i in range(offset, num_tokens, param):
                passages.append({"text":unTokenize(tokens[i:i+param])})

        res[param]=passages

    return res

示例#3

显示文件

文件： doc_representation.py 项目： danieldmm/minerva

def addDocBOWFullTextField(doc,res_dict,doctext=None):
    """
        Store the normalized full text of a document under res_dict["_full_text"].

        The text has citations removed, is lower-cased, tokenized and then
        joined back together with unTokenize.

        Args:
            doc: document to take the text from when doctext is not supplied
            res_dict: dictionary updated in place with the "_full_text" key
            doctext: text to use instead of the document's own; when empty or
                None it is fetched with doc.getFullDocumentText(doc)
    """
    source_text = doctext if doctext else doc.getFullDocumentText(doc)
    lowered = removeCitations(source_text).lower()
    res_dict["_full_text"] = unTokenize(tokenizeText(lowered))

示例#4

显示文件

文件： doc_representation.py 项目： danieldmm/minerva

def getDocBOWfull(doc, parameters=None, doctext=None):
    """
        Build a single BOW covering the whole text of a document.

        The text has citations removed, is lower-cased, tokenized and then
        joined back together with unTokenize.

        Args:
            doc: full SciDoc to get text for
            parameters: not used by this function; accepted so that it has the
                same signature as the other BOW extraction functions
            doctext: text to use instead of the document's own; when empty or
                None it is fetched with doc.getFullDocumentText(doc)
        Returns:
            dictionary with the single key 1, mapped to a one-element list
            holding {"text": <normalized text>}
    """
    source_text = doctext if doctext else doc.getFullDocumentText(doc)
    normalized = unTokenize(tokenizeText(removeCitations(source_text).lower()))

    # Every extraction function takes a list of parameters and returns
    # dict[parameter] = list of BOWs; this one always reports under key 1.
    return {1: [{"text": normalized}]}

示例#5

显示文件

文件： doc_representation.py 项目： danieldmm/minerva

def getDocBOWTitleAbstract(doc, parameters=None, doctext=None):
    """
        Get BOW for document made up of only title and abstract

        The text is the title from doc["metadata"]["title"] followed by the
        text of the document's first section (assumed here to be the
        abstract -- confirm against the SciDoc layout). It has citations
        removed, is lower-cased, tokenized and joined back with unTokenize.

        Args:
            doc: document providing the metadata and the sections
            parameters: not used by this function; accepted so that it has the
                same signature as the other BOW extraction functions
            doctext: not used; any value passed in is ignored and the text is
                always rebuilt from doc
        Returns:
            dictionary with the single key 1, mapped to a one-element list
            holding {"text": <normalized text>}
    """
    doctext=doc["metadata"]["title"]+". "
    if len(doc.allsections) > 0:
        try:
            doctext+=" " + doc.getSectionText(doc.allsections[0],False)
        except Exception:
            # Best effort: keep the title and mark the missing section text.
            # `Exception` rather than a bare `except` so KeyboardInterrupt and
            # SystemExit are no longer swallowed.
            doctext+=u"<UNICODE ERROR>"
    doctext=removeCitations(doctext).lower()
    tokens=tokenizeText(doctext)
    return {1:[{"text":unTokenize(tokens)}]}