def get_definitions(text, return_sources=False, decode_unicode=True) -> Generator:
    """
    Find possible definitions in natural language.
    :param decode_unicode:
    :param return_sources:
    :param text:
    :return:
    """
    for sentence in get_sentence_list(text):
        result = set()
        if decode_unicode:
            sentence = unidecode.unidecode(sentence)
        for item in TRIGGER_WORDS_PTN_RE.findall(sentence):
            result.update(EXTRACT_PTN_RE.findall(item))
        # case #2
        result.update(PAREN_PTN_RE.findall(sentence))
        for term in result:
            if len(get_token_list(term)) <= MAX_TERM_TOKENS:
                if return_sources:
                    yield (term, sentence)
                else:
                    yield term
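# A minimal usage sketch for get_definitions above, not taken from the original source.
# It assumes TRIGGER_WORDS_PTN_RE, EXTRACT_PTN_RE, PAREN_PTN_RE and MAX_TERM_TOKENS are
# configured as in that function; the sample text is hypothetical.
sample_text = 'For purposes of this Agreement, "Confidential Information" shall mean any non-public data.'
for term, source in get_definitions(sample_text, return_sources=True):
    print(term, "<-", source)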
def get_copyright(text, return_sources=False) -> Generator:
    """
    Find copyright in text.
    :param text:
    :param return_sources:
    :return:
    """
    # Iterate through sentences
    if COPYRIGHT_PTN_RE.search(text):
        for sentence in get_sentence_list(text):
            for phrase in np_extractor.get_np(sentence):
                cps = COPYRIGHT_PTN_RE.findall(phrase)
                for cp_text, cp_sign, cp_date, cp_name in cps:
                    # TODO: catch in the general regex
                    if not cp_date:
                        cp_date_at_end = YEAR_PTN_RE.search(cp_name)
                        if cp_date_at_end:
                            cp_date = cp_date_at_end.group()
                            cp_name = re.sub(r'{}$'.format(cp_date), '', cp_name)
                    ret = (cp_sign.strip(),
                           cp_date.replace(' ', ''),
                           cp_name.strip(string.punctuation + string.whitespace))
                    if return_sources:
                        ret += (cp_text.strip(),)
                    yield ret
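# Hypothetical driver for get_copyright above (assumes COPYRIGHT_PTN_RE, YEAR_PTN_RE and
# np_extractor are set up as in that function); the sample string is invented.
sample_text = "Copyright (C) 2015 Acme Widgets Corp. All rights reserved."
for cp_sign, cp_date, cp_name in get_copyright(sample_text):
    print(cp_sign, cp_date, cp_name)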
def get_constraints(text, strict=False) -> Generator:
    """
    Find possible constraints in natural language.
    :param text:
    :param strict:
    :return:
    """
    # Iterate through all potential matches
    for sentence in get_sentence_list(text):
        for match in RE_CONSTRAINT.finditer(sentence.lower()):
            # Get individual group matches
            captures = match.capturesdict()
            num_pre = len(captures["pre"])
            num_post = len(captures["post"])

            # Skip if strict and empty pre/post
            if strict and (num_pre + num_post == 0):
                continue

            # Setup fields
            constraint = captures.get("constraint").pop().lower()
            pre = "".join(captures["pre"])
            post = "".join(captures["post"])

            if num_post == 0 and num_pre == 1:
                combined = "{0} {1}".format(pre, constraint).lower().strip()
                if combined in CONSTRAINT_PHRASES:
                    constraint = combined

            # Append
            yield (constraint, pre, post)
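# Hedged usage sketch for get_constraints above; RE_CONSTRAINT and CONSTRAINT_PHRASES are
# assumed to match phrases such as the one below, which is a made-up example.
sample_text = "The fee shall be no more than five percent of the outstanding balance."
for constraint, pre, post in get_constraints(sample_text):
    print(constraint, "| pre:", pre, "| post:", post)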
def process(input_directory, input_file, section_name):
    #Speedup: bash runStanfordParserServer.sh, bash runSSTServer.sh
    import sys
    import pip
    import spacy
    import neuralcoref
    import lexnlp.nlp.en.segments.sentences as lex_sentences
    import question_generator as gen
    import csv
    import time

    #Load
    start_time = time.time()
    with open(input_file, 'r') as file:
        brief = file.read()
    print("--- %s seconds to Load ---" % (time.time() - start_time))

    #Preprocess
    ##start_time = time.time()
    ##brief = lex_sentences.pre_process_document(brief)
    ##print("--- %s seconds to LexNLP Preprocess---" % (time.time() - start_time))

    start_time = time.time()
    pronouns = spacy.load('en_core_web_sm')
    neuralcoref.add_to_pipe(pronouns, greedyness=0.5, max_dist=100, blacklist=False)
    neural = pronouns(brief)
    brief = neural._.coref_resolved
    print("--- %s seconds to Pronoun Fix ---" % (time.time() - start_time))

    #Tokenize
    start_time = time.time()
    sentences = list(lex_sentences.get_sentence_list(brief))
    questions = gen.QuestionGenerator()
    print("--- %s seconds to Tokenize ---" % (time.time() - start_time))
def get_condition_annotations(text: str, strict=True) \
        -> Generator[ConditionAnnotation, None, None]:
    """
    Find possible conditions in natural language.
    :param text:
    :param strict:
    :return:
    """
    # Iterate through all potential matches
    for sentence in get_sentence_list(text):
        for match in RE_CONDITION.finditer(sentence):
            # Get individual group matches
            captures = match.capturesdict()
            num_pre = len(captures["pre"])
            num_post = len(captures["post"])

            # Skip if strict and empty pre/post
            if strict and (num_pre == 0 or num_post == 0):
                continue

            ant = ConditionAnnotation(
                coords=match.span(),
                condition=captures["condition"].pop().lower(),
                pre=captures["pre"].pop(),
                post=captures["post"].pop())
            yield ant
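# Illustrative only: consuming the ConditionAnnotation objects yielded above. RE_CONDITION
# and the ConditionAnnotation class are assumed to exist exactly as referenced there.
sample_text = "The deposit will be refunded if the tenant returns the keys on time."
for ant in get_condition_annotations(sample_text, strict=False):
    print(ant.coords, ant.condition, "| pre:", ant.pre, "| post:", ant.post)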
def get_noun_phrases(text, strict=False, return_source=False, window=3,
                     valid_punctuation=None) -> Generator:
    """
    Get NNP phrases from text
    """
    valid_punctuation = valid_punctuation or VALID_PUNCTUATION
    # Iterate through sentences
    for sentence in get_sentence_list(text):
        # Tag sentence
        sentence_pos = nltk.pos_tag(get_token_list(sentence))

        # Iterate through chunks
        nnps = []
        last_nnp_pos = None
        for i, chunk in enumerate(sentence_pos):
            do_join = not strict and last_nnp_pos is not None and (i - last_nnp_pos) < window
            # Check label
            if chunk[1] in ['NNP', 'NNPS']:
                if do_join:
                    sep = "" if "(" in valid_punctuation and nnps[-1][-1] == "(" else " "
                    nnps[-1] += sep + chunk[0]
                else:
                    nnps.append(chunk[0])
                last_nnp_pos = i
            elif do_join:
                if chunk[1] in ['CC'] or chunk[0] in valid_punctuation:
                    if chunk[0].lower() in ["or"]:
                        continue
                    nnps[-1] += (' ' if chunk[0].lower() in ['&', 'and', '('] else '') + chunk[0]
                    last_nnp_pos = i
                else:
                    last_nnp_pos = None

        # Clean up names and yield
        for nnp in nnps:
            # Cleanup
            nnp = nnp.strip()
            if len(nnp) <= 2:
                continue
            if nnp.lower().endswith(' and'):
                nnp = nnp[0:-4].strip()
            elif nnp.endswith(' &'):
                nnp = nnp[0:-2].strip()
            nnp = strip_unicode_punctuation(nnp).strip(string.punctuation).strip(string.whitespace)
            if return_source:
                yield nnp, sentence
            else:
                yield nnp
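# A small, hypothetical driver for get_noun_phrases above; it assumes the NLTK taggers it
# relies on are installed and VALID_PUNCTUATION is defined as in the surrounding module.
sample_text = "Smith & Jones LLP retained Acme Holdings as financial advisor."
for phrase in get_noun_phrases(sample_text):
    print(phrase)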
def get_geopolitical(text, strict=False, return_source=False, window=2) -> Generator:
    """
    Get GPEs from text
    """
    # Iterate through sentences
    for sentence in get_sentence_list(text):
        # Tag sentence
        sentence_pos = nltk.pos_tag(get_token_list(sentence))

        # Iterate through chunks
        gpes = []
        last_gpe_pos = None
        for i, chunk in enumerate(nltk.ne_chunk(sentence_pos)):
            if isinstance(chunk, nltk.tree.Tree):
                # Check label
                if chunk.label() == 'GPE':
                    if not strict and last_gpe_pos is not None and (i - last_gpe_pos) < window:
                        gpes[-1] += " " + " ".join([c[0] for c in chunk])
                    else:
                        gpes.append(" ".join([c[0] for c in chunk]))
                    last_gpe_pos = i
            elif not strict and last_gpe_pos is not None and (i - last_gpe_pos) < window:
                if chunk[1] in ["NNP", "NNPS"]:
                    gpes[-1] += " " + chunk[0]
                    last_gpe_pos = i
                elif chunk[1] in ["CC"] or chunk[0] in VALID_PUNCTUATION:
                    if chunk[0].lower() in ["or"]:
                        continue
                    gpes[-1] += (" " if chunk[0].lower() in ["&", "and"] else "") + chunk[0]
                    last_gpe_pos = i
                else:
                    last_gpe_pos = None

        # Clean up names and yield
        for gpe in gpes:
            # Cleanup
            gpe = gpe.strip()
            if len(gpe) <= 2:
                continue
            if gpe.lower().endswith(" and"):
                gpe = gpe[0:-4]
            elif gpe.endswith(" &"):
                gpe = gpe[0:-2]
            gpe = strip_unicode_punctuation(gpe).strip(string.punctuation).strip(string.whitespace)
            if return_source:
                yield gpe, sentence
            else:
                yield gpe
def get_trademarks(text) -> Generator:
    """
    Find trademarks in text.
    :param text:
    :return:
    """
    # Iterate through sentences
    if TRADEMARK_PTN_RE.search(text):
        for sentence in get_sentence_list(text):
            for phrase in np_extractor.get_np(sentence):
                tms = TRADEMARK_PTN_RE.findall(phrase)
                for tm in tms:
                    yield tm
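# Hypothetical call to get_trademarks above; TRADEMARK_PTN_RE and np_extractor are assumed
# to be configured as in that function, and the sample sentence is invented.
sample_text = "The Services are provided under the ACME(R) and WidgetPro(TM) marks."
for tm in get_trademarks(sample_text):
    print(tm)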
def process(input_directory, input_file, section_name):
    #Speedup: bash runStanfordParserServer.sh, bash runSSTServer.sh
    import sys
    import pip
    import spacy
    import neuralcoref
    import lexnlp.nlp.en.segments.sentences as lex_sentences
    import question_generator as gen
    import csv
    import time

    #Load
    start_time = time.time()
    with open(input_file, 'r') as file:
        brief = file.read()
    print("--- %s seconds to Load ---" % (time.time() - start_time))

    #Preprocess
    ##start_time = time.time()
    ##brief = lex_sentences.pre_process_document(brief)
    ##print("--- %s seconds to LexNLP Preprocess---" % (time.time() - start_time))

    start_time = time.time()
    pronouns = spacy.load('en')
    neuralcoref.add_to_pipe(pronouns, greedyness=0.5, max_dist=100, blacklist=False)
    neural = pronouns(brief)
    brief = neural._.coref_resolved
    print("--- %s seconds to Pronoun Fix ---" % (time.time() - start_time))

    #Tokenize
    start_time = time.time()
    sentences = list(lex_sentences.get_sentence_list(brief))
    questions = gen.QuestionGenerator()
    print("--- %s seconds to Tokenize ---" % (time.time() - start_time))

    #Print
    start_time = time.time()
    with open(input_directory + "/" + section_name + '.csv', 'w') as csvfile:
        qawriter = csv.writer(csvfile)
        qawriter.writerow(["Q", "A"])
        for sentence in sentences:
            flashcard = questions.generate_question(sentence)
            if flashcard:
                qawriter.writerow([flashcard[0]['Q'], flashcard[0]['A']])
    print("--- %s seconds to Generate Questions ---" % (time.time() - start_time))
def get_definitions(text, return_sources=False, decode_unicode=True) -> Generator:
    """
    Find possible definitions in natural language in text.
    The text will be split into sentences first.
    :param decode_unicode:
    :param return_sources: returns a tuple with the extracted term and the source sentence
    :param text: the input text
    :return:
    """
    for sentence in get_sentence_list(text):
        yield from get_definitions_in_sentence(sentence, return_sources, decode_unicode)
def run_sentence_token_test(text, result, lowercase=False, stopword=False):
    """
    Base test method to run against text with given results.
    """
    # Get list from text
    sentence_list = get_sentence_list(text)

    # Check length first
    assert len(sentence_list) == len(result)

    # Check each sentence matches
    for i in range(len(sentence_list)):
        tokens = lexnlp_tests.benchmark_extraction_func(get_token_list,
                                                        sentence_list[i],
                                                        lowercase=lowercase,
                                                        stopword=stopword)
        assert_list_equal(tokens, result[i])
def process(section_text, section_name):
    #Speedup: bash runStanfordParserServer.sh, bash runSSTServer.sh
    import sys
    import pip
    import spacy
    import neuralcoref
    import lexnlp.nlp.en.segments.sentences as lex_sentences
    import question_generator as gen
    import csv
    import time

    #Load
    start_time = time.time()
    brief = section_text
    print("--- %s seconds to Load ---" % (time.time() - start_time))

    start_time = time.time()
    pronouns = spacy.load('en')
    neuralcoref.add_to_pipe(pronouns, greedyness=0.5, max_dist=100, blacklist=False)
    neural = pronouns(brief)
    brief = neural._.coref_resolved
    print("--- %s seconds to Pronoun Fix ---" % (time.time() - start_time))

    #Tokenize
    start_time = time.time()
    sentences = list(lex_sentences.get_sentence_list(brief))
    questions = gen.QuestionGenerator()
    print("--- %s seconds to Tokenize ---" % (time.time() - start_time))

    #Print
    start_time = time.time()
    for sentence in sentences:
        flashcard = questions.generate_question(sentence)
        if flashcard:
            partial = {
                "question": flashcard[0]['Q'],
                "answer": flashcard[0]['A']
            }
            # `result` is expected to be a module-level dict mapping section_name to a list
            result[section_name].append(partial)
    print("--- %s seconds to Generate Questions ---" % (time.time() - start_time))
def get_locations(text, strict=False, return_source=False, window=2) -> Generator:
    """
    Get locations from text using Stanford libraries.
    :param window:
    :param return_source:
    :param strict:
    :param text:
    :return:
    """
    # Iterate through sentences
    for sentence in get_sentence_list(text):
        # Tag the sentence (not the whole text) so yielded sources stay aligned
        sentence_pos = STANFORD_NER_TAGGER.tag(get_token_list(sentence))

        # Iterate through chunks
        locations = []
        last_loc_pos = None
        for i, token in enumerate(sentence_pos):
            # Check label
            if token[1] == 'LOCATION':
                if not strict and last_loc_pos is not None and (i - last_loc_pos) < window:
                    locations[-1] += (" " if not token[0].startswith("'") else "") + token[0]
                else:
                    locations.append(token[0])
                last_loc_pos = i
            else:
                if token[0] in [".", ","]:
                    if not strict and last_loc_pos is not None and (i - last_loc_pos) < window:
                        locations[-1] += (" " if token[0] not in string.punctuation
                                          and not token[0].startswith("'") else "") + token[0]
                        last_loc_pos = i

        # Cleanup and yield
        for location in locations:
            location = strip_unicode_punctuation(location).strip(string.punctuation).strip(string.whitespace)
            if return_source:
                yield location, sentence
            else:
                yield location
def get_persons(text, strict=False, return_source=False, window=2) -> Generator:
    """
    Get persons from text using Stanford libraries.
    :param window:
    :param return_source:
    :param strict:
    :param text:
    :return:
    """
    # Iterate through sentences
    for sentence in get_sentence_list(text):
        # Tag the sentence (not the whole text) so yielded sources stay aligned
        sentence_pos = STANFORD_NER_TAGGER.tag(get_token_list(sentence))

        # Iterate through chunks
        names = []
        last_person_pos = None
        for i, token in enumerate(sentence_pos):
            # Check label
            if token[1] == 'PERSON':
                if not strict and last_person_pos is not None and (i - last_person_pos) < window:
                    names[-1] += " " + token[0]
                else:
                    names.append(token[0])
                last_person_pos = i
            else:
                if token[0] in [".", ","]:
                    if not strict and last_person_pos is not None and (i - last_person_pos) < window:
                        names[-1] += (" " if token[0] not in string.punctuation else "") + token[0]
                        last_person_pos = i

        # Cleanup and yield
        for name in names:
            name = strip_unicode_punctuation(name).strip(string.punctuation).strip(string.whitespace)
            if return_source:
                yield name, sentence
            else:
                yield name
def get_conditions(text, strict=True) -> Generator:
    """
    Find possible conditions in natural language.
    :param text:
    :param strict:
    :return:
    """
    # Iterate through all potential matches
    for sentence in get_sentence_list(text):
        for match in RE_CONDITION.finditer(sentence):
            # Get individual group matches
            captures = match.capturesdict()
            num_pre = len(captures["pre"])
            num_post = len(captures["post"])

            # Skip if strict and empty pre/post
            if strict and (num_pre == 0 or num_post == 0):
                continue

            # Otherwise, append
            yield (captures["condition"].pop().lower(),
                   captures["pre"].pop(),
                   captures["post"].pop())
def get_persons(text, strict=False, return_source=False, window=2) -> Generator:
    """
    Get names from text.
    :param window:
    :param return_source:
    :param strict:
    :param text:
    :return:
    """
    # Iterate through sentences
    for sentence in get_sentence_list(text):
        # Tag sentence
        sentence_pos = nltk.pos_tag(get_token_list(sentence))
        companies = list(get_company_annotations(text))

        # Iterate through chunks
        persons = []
        last_person_pos = None
        for i, chunk in enumerate(nltk.ne_chunk(sentence_pos)):
            if isinstance(chunk, nltk.tree.Tree):
                # Check label
                if chunk.label() == 'PERSON':
                    if not strict and last_person_pos is not None and (i - last_person_pos) < window:
                        persons[-1] += " " + " ".join([c[0] for c in chunk])
                    else:
                        persons.append(" ".join([c[0] for c in chunk]))
                    last_person_pos = i
            elif not strict and last_person_pos is not None and (i - last_person_pos) < window:
                if chunk[1] in ["NNP", "NNPS"]:
                    persons[-1] += " " + chunk[0]
                    last_person_pos = i
                elif chunk[1] in ["CC"] or chunk[0] in VALID_PUNCTUATION:
                    if chunk[0].lower() in ["or"]:
                        continue
                    persons[-1] += (" " if chunk[0].lower() in ["&", "and"] else "") + chunk[0]
                    last_person_pos = i
                else:
                    last_person_pos = None

        # Cleanup and yield
        for person in persons:
            # Cleanup
            person = person.strip()
            if len(person) <= 2:
                continue
            if PERSONS_STOP_WORDS.search(person):
                continue
            person = strip_unicode_punctuation(person).strip(string.punctuation).strip(string.whitespace)
            if contains_companies(person, companies):
                continue
            if person.lower().endswith(" and"):
                person = person[0:-4]
            elif person.endswith(" &"):
                person = person[0:-2]
            if return_source:
                yield person, sentence
            else:
                yield person
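# Illustrative usage of the NLTK-based get_persons above; the helpers it references
# (PERSONS_STOP_WORDS, get_company_annotations, contains_companies) are assumed available.
sample_text = "Notices shall be sent to John A. Smith and Mary Jones at the address above."
for person, source in get_persons(sample_text, return_source=True):
    print(person, "<-", source)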
def get_companies(text: str,
                  use_article: bool = False,
                  detail_type: bool = False,
                  parse_name_abbr: bool = False,
                  return_source: bool = False) -> Generator:
    """
    Find company names in text, optionally using the stricter article/prefix expression.
    :param text:
    :param use_article:
    :param detail_type:
    :param parse_name_abbr:
    :param return_source:
    :return:
    """
    # Select regex
    re_c = RE_ARTICLE_COMPANY if use_article else RE_COMPANY

    # Iterate through sentences
    for sentence in get_sentence_list(text):
        for match in re_c.finditer(sentence):
            captures = match.capturesdict()

            company_type = captures["company_type_of"] or \
                           captures["company_type"] or \
                           captures["company_type_single"]
            company_type = "".join(company_type).strip(
                string.punctuation.replace(".", "") + string.whitespace)
            company_type = company_type or None

            company_name = "".join(captures["full_name"])
            if company_type:
                company_name = re.sub(r'%s$' % company_type, '', company_name)
            company_name = company_name.strip(
                string.punctuation.replace('&', '').replace(')', '') + string.whitespace)
            company_name = re.sub(r'^\s*(?:and|&|of)\s+|\s+(?:and|&|of)\s*$', '',
                                  company_name, flags=re.IGNORECASE)
            if not company_name:
                continue

            # f.e., a Delaware company
            if company_name.lower().startswith('a ') or captures.get('article') == ['a']:
                continue

            company_description = captures["company_description_of"] or \
                                  captures["company_description_and"] or \
                                  captures["company_description"] or \
                                  captures["company_description_single"]
            company_description = "".join(company_description).strip(
                string.punctuation + string.whitespace)

            # catch ABC & Company LLC case
            if company_description.lower() == 'company' and \
                    ('& company' in company_name.lower() or 'and company' in company_name.lower()):
                company_description = None
            company_description = company_description or None

            if company_description:
                company_name = re.sub(r'[\s,]%s$' % company_description, '', company_name)

            if not company_name or \
                    ARTICLE_RE.fullmatch(company_name) or \
                    re.match(r'.+?\s(?:of|in)$', company_name.lower()):
                continue
            if company_name in COMPANY_DESCRIPTIONS:
                continue

            abbr_name = "".join(captures["abbr_name"]) or None

            ret = (company_name, company_type)
            if detail_type:
                ret += (COMPANY_TYPES[company_type.lower()]['abbr'] if company_type else None,
                        COMPANY_TYPES[company_type.lower()]['label'] if company_type else None)
            ret += (company_description,)
            if parse_name_abbr:
                ret += (abbr_name,)
            if return_source:
                ret += (sentence,)
            # no args: = [company_name, company_type, company_description]
            # detail_type: + [company_type_abbr, company_type_label]
            # parse_name_abbr: + [abbr_name]
            # return_source: + [source]
            yield ret
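# Sketch showing how the tuple shapes documented at the end of get_companies above line up
# with its flags; the sample sentence is invented and RE_COMPANY et al. are assumed configured.
sample_text = "This Agreement is made by Acme Widgets, Inc., a Delaware corporation."
for name, co_type, description in get_companies(sample_text):
    print(name, "|", co_type, "|", description)
for row in get_companies(sample_text, detail_type=True, parse_name_abbr=True, return_source=True):
    print(row)  # (name, type, type_abbr, type_label, description, abbr_name, sentence)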
def process_fields(doc: LeaseDocument, doc_text: str, task: ExtendedTask):
    sentences = get_sentence_list(doc_text)

    # fields = detect_fields(sentences, groups=('address',))
    fields = detect_fields(sentences)

    doc.address = fields.get('address')
    if not doc.address:
        doc.address = detect_address_default(doc_text, sentences)

    # term
    doc.commencement_date = fields.get('commencement_date')
    doc.expiration_date = fields.get('expiration_date')
    term_tuple = fields.get('term')
    if term_tuple:
        term = timedelta(days=term_tuple[2])
        if doc.commencement_date and not doc.expiration_date:
            doc.expiration_date = doc.commencement_date + term
        elif not doc.commencement_date and doc.expiration_date:
            doc.commencement_date = doc.expiration_date - term

    if doc.commencement_date \
            and doc.expiration_date \
            and doc.commencement_date >= doc.expiration_date:
        doc.expiration_date = None

    # lease type
    pay_taxes = int(fields.get('pay_taxes') or False)
    pay_costs = int(fields.get('pay_costs') or False)
    pay_insurance = int(fields.get('pay_insurance') or False)
    lt = pay_taxes + pay_costs + pay_insurance
    if lt == 3:
        doc.lease_type = 'triple-net'
    elif lt == 2:
        doc.lease_type = 'double-net'
    elif lt == 1:
        doc.lease_type = 'single-net'
    else:
        doc.lease_type = 'gross'

    # property type
    property_types = list(fields.get('property_types__set') or set())
    property_types.sort()
    doc.property_type = '; '.join(property_types)

    # permitted use
    doc.permitted_uses = fields.get('permitted_use')

    # prohibited use
    doc.prohibited_uses = ProcessLeaseDocuments.ordered_list_without_repetitions(
        fields.get('prohibited_use__list'))

    renew_duration_tuple = fields.get('renew_non_renew_notice')
    if renew_duration_tuple:
        doc.renew_non_renew_notice_duration = timedelta(days=renew_duration_tuple[2])

    auto_renew = fields.get('auto_renew')
    if auto_renew is not None:
        doc.auto_renew = auto_renew

    area_square_feet_list = fields.get('area_square_feet__list')
    if area_square_feet_list:
        doc.area_size_sq_ft = area_square_feet_list[0]

    doc.alterations_allowed = ProcessLeaseDocuments.ordered_list_without_repetitions(
        fields.get('alterations_allowed__list'))

    security_deposit = fields.get('security_deposit__set')
    if security_deposit:
        doc.security_deposit = max(security_deposit)

    doc.rent_due_frequency = fields.get('rent_due_frequency')

    mean_rent_per_month = fields.get('mean_rent_per_month__set')
    if mean_rent_per_month:
        doc.mean_rent_per_month = max(mean_rent_per_month)
def process_document(document):
    doc_words = []
    for sentence in get_sentence_list(document):
        doc_words.extend(process_sentence(sentence))
    return doc_words
def process_fields(doc: LeaseDocument, doc_text: str, task: ExtendedTask):
    sentences = get_sentence_list(doc_text)

    # fields = detect_fields(sentences, groups=('address',))
    fields = detect_fields(sentences)

    doc.address = fields.get('address')
    if not doc.address:
        doc.address = detect_address_default(doc_text, sentences)
    if doc.address:
        g = geocoder.google(doc.address)
        if g.ok:
            doc.address_latitude = g.lat
            doc.address_longitude = g.lng
            doc.address_country = g.country_long
            doc.address_state_province = g.province_long
        elif g.status and 'ZERO' in g.status:
            # Google does not know such address - probably we detected it wrong.
            doc.address = None
            doc.address_state_province = None
            doc.address_country = None
            doc.address_longitude = None
            doc.address_latitude = None
        else:
            task.log_warn(
                'Google did not return geocode info for: {0}\nResponse: {1}'.format(doc.address, g))
            # return

    # term
    doc.commencement_date = fields.get('commencement_date')
    doc.expiration_date = fields.get('expiration_date')
    term_tuple = fields.get('term')
    if term_tuple:
        term = timedelta(days=term_tuple[2])
        if doc.commencement_date and not doc.expiration_date:
            doc.expiration_date = doc.commencement_date + term
        elif not doc.commencement_date and doc.expiration_date:
            doc.commencement_date = doc.expiration_date - term

    if doc.commencement_date \
            and doc.expiration_date \
            and doc.commencement_date >= doc.expiration_date:
        doc.expiration_date = None

    # lease type
    pay_taxes = int(fields.get('pay_taxes') or False)
    pay_costs = int(fields.get('pay_costs') or False)
    pay_insurance = int(fields.get('pay_insurance') or False)
    lt = pay_taxes + pay_costs + pay_insurance
    if lt == 3:
        doc.lease_type = 'triple-net'
    elif lt == 2:
        doc.lease_type = 'double-net'
    elif lt == 1:
        doc.lease_type = 'single-net'
    else:
        doc.lease_type = 'gross'

    # property type
    property_types = list(fields.get('property_types__set') or set())
    property_types.sort()
    doc.property_type = '; '.join(property_types)

    # permitted use
    doc.permitted_uses = fields.get('permitted_use')

    # prohibited use
    doc.prohibited_uses = ProcessLeaseDocuments.ordered_list_without_repetitions(
        fields.get('prohibited_use__list'))

    renew_duration_tuple = fields.get('renew_non_renew_notice')
    if renew_duration_tuple:
        doc.renew_non_renew_notice_duration = timedelta(days=renew_duration_tuple[2])

    auto_renew = fields.get('auto_renew')
    if auto_renew is not None:
        doc.auto_renew = auto_renew

    area_square_feet_list = fields.get('area_square_feet__list')
    if area_square_feet_list:
        doc.area_size_sq_ft = area_square_feet_list[0]

    doc.alterations_allowed = ProcessLeaseDocuments.ordered_list_without_repetitions(
        fields.get('alterations_allowed__list'))

    security_deposit = fields.get('security_deposit__set')
    if security_deposit:
        doc.security_deposit = max(security_deposit)

    doc.rent_due_frequency = fields.get('rent_due_frequency')

    mean_rent_per_month = fields.get('mean_rent_per_month__set')
    if mean_rent_per_month:
        doc.mean_rent_per_month = max(mean_rent_per_month)
def test_sentence_segmenter_empty():
    """
    Test sentence segmentation on empty input.
    """
    _ = get_sentence_list("")
import sys

import lexnlp.nlp.en.segments.sentences as lex_sentences
import lexnlp.extract.en.dates as lex_dates
import lexnlp.extract.en.entities.nltk_maxent as lex_entities

direct_path = "/Users/brandon/Documents/Northwestern Courses/Winter 2019/CS+Law Innovation Lab/Orrick, Harrington, & Sutcliffe/Documents/Dish_Sample.txt"

with open(direct_path, 'r') as file:
    brief = file.read()

processed_brief = lex_sentences.pre_process_document(brief)
sentences_brief = lex_sentences.get_sentence_list(processed_brief)

facts = []
for sentence in sentences_brief:
    entities = lex_entities.get_persons(sentence)
    for entity in entities:
        facts.append((entity, sentence))

for fact in facts:
    print("Question:\nWhy is {} relevant?\n\nAnswer:\n{}".format(fact[0], fact[1]))
    print("\n---------------\n")

'''
Question:
Why is Farmers Branch relevant?

Answer:
In 2009, DISH began a pilot program to test QPC, a new incentive-based system at several locations, including two of its eight offices in the North Texas region: Farmers Branch and North Richland Hills.
            else:
                text_content = ''
        except Exception as e:
            print(("error in content extraction", e))
            continue

        # skip if empty
        if text_content is None:
            continue
        if len(text_content.strip()) == 0:
            continue

        try:
            # build word2vec sentence list and doc2vec content simultaneously
            doc_stems = []
            for sentence in get_sentence_list(text_content):
                sentence_stems = [s for s in get_stem_list(sentence, stopword=True, lowercase=True)
                                  if s.isalpha()]
                doc_stems.extend(sentence_stems)
                sentences.append(sentence_stems)

            documents.append(
                gensim.models.doc2vec.TaggedDocument(doc_stems, ["{0}".format(court_tar_member.name)]))
        except Exception as e:
            print(e)

# word2vec models
def get_companies(text: str,
                  strict: bool = False,
                  use_gnp: bool = False,
                  detail_type: bool = False,
                  count_unique: bool = False,
                  name_upper: bool = False,
                  parse_name_abbr: bool = False,
                  return_source: bool = False):
    """
    Find company names in text by extracting noun phrases and matching them against the
    company-name expression.
    :param text:
    :param strict:
    :param use_gnp: use get_noun_phrases or NPExtractor
    :param detail_type: return detailed type (type, unified type, label) vs type only
    :param name_upper: return company name in upper case.
    :param count_unique: return only unique companies - case insensitive.
    :param parse_name_abbr: return company abbreviated name if exists.
    :param return_source:
    :return:
    """
    # skip if all text is in uppercase
    if text == text.upper():
        return
    valid_punctuation = VALID_PUNCTUATION + ["(", ")"]
    unique_companies = dict()

    if COMPANY_TYPES_RE.search(text):
        # Iterate through sentences
        for sentence in get_sentence_list(text):
            # skip if whole phrase is in uppercase
            if sentence == sentence.upper():
                continue
            if use_gnp:
                phrases = get_noun_phrases(sentence, strict=strict, valid_punctuation=valid_punctuation)
            else:
                phrases = np_extractor.get_np(sentence)
            for phrase in phrases:
                if COMPANY_TYPES_RE.search(phrase):
                    for result in nltk_re.get_companies(phrase, detail_type=True, parse_name_abbr=True):
                        co_name, co_type, co_type_abbr, co_type_label, co_desc, co_abbr = result
                        if co_name == co_type or co_name == co_desc:
                            continue
                        if name_upper:
                            co_name = co_name.upper()
                        result = (co_name, co_type)
                        if detail_type:
                            result += (co_type_abbr, co_type_label, co_desc)
                        if parse_name_abbr:
                            result += (co_abbr,)
                        if return_source and not count_unique:
                            result = result + (sentence,)
                        if count_unique:
                            unique_key = (result[0].lower() if result[0] else None, co_type_abbr)
                            existing_result = unique_companies.get(unique_key)
                            if existing_result:
                                unique_companies[unique_key] = \
                                    existing_result[:-1] + (existing_result[-1] + 1,)
                            else:
                                unique_companies[unique_key] = result + (1,)
                        else:
                            yield result

    if count_unique:
        for company in unique_companies.values():
            yield company
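# Hypothetical use of the phrase-based get_companies above with count_unique=True, which
# yields each distinct company once with a trailing occurrence count.
sample_text = ("Acme Widgets LLC supplies parts to Beta Industries Inc. "
               "Acme Widgets LLC also services Beta Industries Inc.")
for company in get_companies(sample_text, count_unique=True):
    print(company)  # last element is the number of occurrences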
##brief = lex_sentences.pre_process_document(brief)
##print("--- %s seconds to LexNLP Preprocess---" % (time.time() - start_time))

start_time = time.time()
pronouns = spacy.load('en')
neuralcoref.add_to_pipe(pronouns, greedyness=0.5, max_dist=100, blacklist=False)
neural = pronouns(brief)
brief = neural._.coref_resolved
print("--- %s seconds to Pronoun Fix ---" % (time.time() - start_time))

#Tokenize
start_time = time.time()
sentences = list(lex_sentences.get_sentence_list(brief))
questions = gen.QuestionGenerator()
print("--- %s seconds to Tokenize ---" % (time.time() - start_time))

#Print
start_time = time.time()
with open('/Users/brandon/Documents/Northwestern Courses/Winter 2019/CS+Law Innovation Lab/Orrick, Harrington, & Sutcliffe/Documents/ex.csv', 'w') as csvfile:
    qawriter = csv.writer(csvfile)
    qawriter.writerow(["Q", "A"])
    for sentence in sentences:
        flashcard = questions.generate_question(sentence)
        if flashcard:
            qawriter.writerow([flashcard[0]['Q'], flashcard[0]['A']])
print("--- %s seconds to Generate csv ---" % (time.time() - start_time))
def get_organizations(text, strict=False, return_source=False, window=2) -> Generator:
    """
    Get organizations from text.
    :param window:
    :param return_source:
    :param strict:
    :param text:
    :return:
    """
    # Iterate through sentences
    for sentence in get_sentence_list(text):
        # Tag sentence
        sentence_pos = nltk.pos_tag(get_token_list(sentence))

        # Iterate through chunks
        organizations = []
        last_org_pos = None
        for i, chunk in enumerate(nltk.ne_chunk(sentence_pos)):
            if isinstance(chunk, nltk.tree.Tree):
                # Check label
                if chunk.label() in ['ORGANIZATION']:
                    if not strict and last_org_pos is not None and (i - last_org_pos) < window:
                        organizations[-1] += " " + " ".join([c[0] for c in chunk])
                    else:
                        organizations.append(" ".join([c[0] for c in chunk]))
                    last_org_pos = i
            elif not strict and last_org_pos is not None and (i - last_org_pos) < window:
                if chunk[1] in ["NNP", "NNPS"]:
                    organizations[-1] += " " + chunk[0]
                    last_org_pos = i
                elif chunk[1] in ["CC"] or chunk[0] in VALID_PUNCTUATION:
                    if chunk[0].lower() in ["or"]:
                        continue
                    organizations[-1] += (" " if chunk[0].lower() in ["&", "and"] else "") + chunk[0]
                    last_org_pos = i
                else:
                    last_org_pos = None

        for org in organizations:
            # Cleanup
            org = org.strip()
            if len(org) <= 2:
                continue
            if org.lower().endswith(" and"):
                org = org[0:-4]
            elif org.endswith(" &"):
                org = org[0:-2]
            org = strip_unicode_punctuation(org).strip(string.punctuation).strip(string.whitespace)
            if return_source:
                yield org, sentence
            else:
                yield org
def get_sentences(self, text=None):
    if not text:
        text = self.text
    return list(lex_sentences.get_sentence_list(text))