Example #1
    def __init__(self, text=None, file=None, options=None):
        """
        Instantiate a single Document, either from a long string, or the contents of a file.

        Parameters
        ----------
        text : str

        file : str

        options : dict or namespace

        """
        verbose = False
        options = options if options is not None else {}  # Avoid mutating a shared default dict
        self.set_options(options)

        # If reading the document from a file, <text> should be None
        if text is None:
            options['one str'] = True
            text = read_file(file, options=options)

        if self.get('normalize'):
            text = normalize(text, options=options)

        # Parse with spaCy and extract named entities, then build Node trees representing sentences
        self.spacy_doc, self.ner, self.vocab = generate_spacy_data(text)
        self.generate_trees()

        if verbose:
            self.print_sentences()
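
A minimal usage sketch, assuming this __init__ belongs to a Document class in the same module (the file name and options below are illustrative only):

doc = Document(file='report.txt', options={'normalize': True})  # parse the contents of a file
doc = Document(text='A short sample document.')                 # or parse a string directly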
Example #2
def search_line(line, index='default', options=None):
    """
    Search for a given substring in an index

    Parameters
    ----------
    index : str
        name of index where things are to be stored

    line : str
        line to be stored

    """
    line = normalize(line)
    docs = []
    seen = set()

    def add_results(results):
        # Deduplicate by document id and score each new match
        for r in results:
            doc = parse_doc_output(r)
            if doc['id'] not in seen:
                seen.add(doc['id'])
                doc['score'] = phrase_similarity(line, doc['name'])
                docs.append(doc)

    add_results(match_search(line, index))

    if isTrue(options, 'simple'):
        return docs

    add_results(prefix_search(line, index))
    add_results(wildcard_search(line, index))

    return sorted(docs, reverse=True, key=lambda x: x['score'])
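
A hedged usage sketch; the result fields ('score', 'name') follow the shape assumed above for parse_doc_output, and the index name is illustrative:

hits = search_line('acme corp', index='companies')
for hit in hits:
    print(hit['score'], hit['name'])  # results arrive sorted by descending score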
Example #3
def word_search(file, index='default'):
    """
    Search for each line of file in the index

    Parameters
    ----------
    index : str
        name of index where things are to be stored

    file : str
        path to file where lines are to be read

    """
    if file is None:
        err(['word_search requires file'])
        raise SystemExit(1)

    seen = set()

    for line in iter_file(file):
        line = line.rstrip()
        # Lines may be pipe-delimited records; the name is the first field
        name = normalize(line.split('|')[0])

        for word in name.split(' '):
            if len(word) < 3:  # skip very short words
                continue
            if word not in seen:  # only search each word once
                seen.add(word)
                search(word, {'prefix': True})
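
A hedged usage sketch, assuming a file of pipe-delimited records with the name in the first field (the file name is illustrative):

word_search('names.txt', index='companies')  # prefix-search each distinct word of each name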
Example #4
def store_line(line, index=None, field='name'):
    """
    Convert a line of text into a document and store it in an index as 'field'

    Parameters
    ----------
    line : str
        line to be stored

    index : str
        name of index where things are to be stored

    """
    if index is None:
        index = 'default'
    doc_type = 'word'
    doc = {field: normalize(line)}  # Default: just one word on each line

    es.index(index=index, doc_type=doc_type, body=doc)
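
A hedged usage sketch, assuming es is a module-level Elasticsearch client (note that doc_type is deprecated in Elasticsearch 7+ clients):

store_line('Acme Corp', index='companies')                 # stored under the default 'name' field
store_line('blue widget', index='products', field='term')  # or under a caller-chosen field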
Example #5
def lemmatize_file(file):
    """
    Normalize and lemmatize each line of a file, returning the list of results
    """
    return [lemmatize(normalize(line)) for line in read_file(file)]
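
A hedged usage sketch (the file name is illustrative):

lemmas = lemmatize_file('corpus.txt')  # one normalized, lemmatized string per input line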