def __init__(self, text=None, file=None, options=None):
    """
    Instantiate a single Document, either from a long string, or the
    contents of a file.

    Parameters
    ----------
    text : str

    file : str

    options : dict or namespace
    """
    if options is None:
        options = {}    # Avoid a shared mutable default argument
    verbose = False
    self.set_options(options)

    # If reading the document from a file, <text> should be None
    if text is None:
        options['one str'] = True
        text = read_file(file, options=options)
    if self.get('normalize'):
        text = normalize(text, options=options)

    # Parse with spaCy, get NER
    self.spacy_doc, self.ner, self.vocab = generate_spacy_data(text)
    self.generate_trees()    # Generate Node trees representing sentences

    if verbose:
        self.print_sentences()
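# Usage sketch for Document (illustrative only: the file path and the
# option keys shown are hypothetical, not defined by this module):
#
#     doc = Document(text="The quick brown fox jumps over the lazy dog.")
#     doc = Document(file='data/article.txt', options={'normalize': True})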
def search_line(line, index='default', options=None):
    """
    Search for a given substring in an index

    Parameters
    ----------
    line : str
        line to be searched for
    index : str
        name of index where things are to be stored
    """
    line = normalize(line)
    docs = []
    seen = set()

    def collect(results):
        # Parse each hit, skip duplicate IDs, and score each doc by its
        # similarity to the query line
        for r in results:
            doc = parse_doc_output(r)
            if doc['id'] not in seen:
                seen.add(doc['id'])
                doc['score'] = phrase_similarity(line, doc['name'])
                docs.append(doc)

    collect(match_search(line, index))
    if isTrue(options, 'simple'):
        return docs

    collect(prefix_search(line, index))
    collect(wildcard_search(line, index))

    docs = sorted(docs, reverse=True, key=lambda x: x['score'])
    return docs
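# Usage sketch for search_line (illustrative only; assumes an index named
# 'default' has already been populated, e.g. via store_line below):
#
#     docs = search_line('acme corporation')
#     for doc in docs[:5]:
#         print(doc['score'], doc['name'])    # hits sorted by similarity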
def word_search(file, index='default'):
    """
    Search for each word of each line of file in the index

    Parameters
    ----------
    file : str
        path to file where lines are to be read
    index : str
        name of index where things are to be stored
    """
    if file is None:
        err(['word_search requires file'])
        exit()
    seen = set()
    for line in iter_file(file):
        # Lines may be pipe-delimited; the name is the first field
        name = normalize(line.rstrip().split('|')[0])
        for word in name.split(' '):
            if len(word) < 3:
                continue
            if word not in seen:    # only search each word once
                seen.add(word)
                search(word, {'prefix': True})
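# Usage sketch for word_search (illustrative only; 'entities.txt' is a
# hypothetical pipe-delimited file, one record per line):
#
#     # entities.txt:
#     #   Acme Corporation|123
#     #   Widget Works|456
#     word_search('entities.txt')    # prefix search per unique word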
def store_line(line, index=None, field='name'):
    """
    Convert a line of text into a document and store it in an index as 'field'

    Parameters
    ----------
    line : str
        line to be stored
    index : str
        name of index where things are to be stored
    field : str
        document field under which the line is stored
    """
    if index is None:
        index = 'default'
    doc = {field: normalize(line)}    # Default: just one word on each line
    es.index(index=index, doc_type='word', body=doc)
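# Usage sketch for store_line (illustrative only; 'names.txt' is a
# hypothetical path, and 'es' must already be a connected Elasticsearch
# client, as store_line assumes):
#
#     for line in iter_file('names.txt'):
#         store_line(line.rstrip(), index='default')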
def lemmatize_file(file):
    """
    Normalize and lemmatize each line of a file

    Parameters
    ----------
    file : str
        path to file where lines are to be read

    Returns
    -------
    list of str
    """
    lines = []
    for line in read_file(file):
        lines.append(lemmatize(normalize(line)))
    return lines
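# Usage sketch for lemmatize_file (illustrative only; 'stories.txt' is a
# hypothetical path):
#
#     lemmas = lemmatize_file('stories.txt')
#     print(lemmas[0])    # first line, normalized and lemmatized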