def get_character_ngram_distribution(text, n, lowercase=False, stopword=False):
    """
    Get character n-gram distribution of text, potentially lowercasing and stopwording first.
    N.B. This method does not include or count whitespace.
    :param text:
    :param n:
    :param lowercase:
    :param stopword:
    :return:
    """
    # Return structure
    character_ngram_distribution = collections.defaultdict(int)

    # Iterate through tokens
    for token in get_token_list(text, lowercase=lowercase, stopword=stopword):
        for char_seq in nltk.ngrams(token, n):
            character_ngram_distribution[char_seq] += 1

    return character_ngram_distribution

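# A minimal usage sketch for get_character_ngram_distribution; the sample text is
# invented and the module-level imports the function relies on (collections, nltk,
# get_token_list) are assumed to be in place.
bigram_counts = get_character_ngram_distribution("The lessee shall pay rent.", n=2, lowercase=True)
# Keys are character tuples such as ('l', 'e'); values are occurrence counts.
top_bigrams = sorted(bigram_counts.items(), key=lambda kv: kv[1], reverse=True)[:5]
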
def get_definitions_in_sentence(sentence: str, return_sources=False,
                                decode_unicode=True) -> Generator:
    """
    Find possible definitions in natural language in a single sentence.
    :param decode_unicode:
    :param return_sources: returns a tuple with the extracted term and the source sentence
    :param sentence: an input sentence
    :return:
    """
    result = set()
    case1_terms = set()
    if decode_unicode:
        sentence = unidecode.unidecode(sentence)

    # case 1
    for item in TRIGGER_WORDS_PTN_RE.findall(sentence):
        result.update(EXTRACT_PTN_RE.findall(item))
        case1_terms.update(EXTRACT_PTN_RE.findall(item))

    # case 2
    result.update(PAREN_PTN_RE.findall(sentence))

    # case 3
    result.update(NOUN_PTN_RE.findall(sentence))

    # case 4
    result.update(COLON_PTN_RE.findall(sentence))

    # return result
    for term in result:
        if term not in case1_terms and len(get_token_list(term)) > MAX_TERM_TOKENS:
            continue
        if return_sources:
            yield (term, sentence)
        else:
            yield term

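# Usage sketch for get_definitions_in_sentence; the sentence is invented and the
# regex constants it references (TRIGGER_WORDS_PTN_RE, EXTRACT_PTN_RE, ...) are
# assumed to be defined in the same module.
for term, source in get_definitions_in_sentence(
        '"Landlord" shall mean the owner of the Premises.', return_sources=True):
    print(term, '<-', source)
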
def parse(self, log: ProcessLogger, text, text_unit_id, _text_unit_lang,
          document_initial_load: bool = False, **kwargs) -> ParseResults:
    project_id = kwargs.get('project_id')
    term_stems = dict_data_cache.get_term_config(project_id)
    text_stems = ' %s ' % ' '.join(get_stems(text, lowercase=True))
    text_tokens = get_token_list(text, lowercase=True)
    term_usages = []
    for stemmed_term, data in term_stems.items():
        # stem not found in text
        if stemmed_term not in text_stems:
            continue
        # if stem has 1 variant only
        if data['length'] == 1:
            count = text_stems.count(stemmed_term)
            if count:
                term_data = list(data['values'][0])
                term_data.append(count)
                term_usages.append(term_data)
        # case when, e.g., stem "respons" is equal to multiple terms
        # ["response", "responsive", "responsibility"]
        else:
            for term_data in data['values']:
                term_data = list(term_data)
                count = text_tokens.count(term_data[0])
                if count:
                    term_data.append(count)
                    term_usages.append(term_data)
                # TODO: "responsibilities"
    return ParseResults({
        TermUsage: [
            TermUsage(text_unit_id=text_unit_id, term_id=pk, count=count)
            for _, pk, count in term_usages
        ]
    })

def train_doc2vec_model(self, data) -> gensim.models.doc2vec.Doc2Vec:
    """
    Train doc2vec model from queryset values
    :param data: training data - iterable set of texts
    :return: Doc2Vec trained model
    """
    doc2vec_data = []
    for index, text in enumerate(data):
        if not text:
            continue
        # Get tokens with LexNLP
        text_tokens = get_token_list(text, stopword=True, lowercase=True)
        # Append gensim object
        doc2vec_data.append(
            gensim.models.doc2vec.TaggedDocument(text_tokens, [index]))

    if not doc2vec_data:
        raise RuntimeError('Empty data set, unable to create Doc2Vec model.')

    # Train model
    try:
        doc2vec_model = gensim.models.doc2vec.Doc2Vec(
            doc2vec_data,
            vector_size=self.vector_size,
            window=self.window,
            dm=self.dm,
            min_count=self.min_count,
            workers=1)
        # finished training a model (=no more updates, only querying), reduce memory usage
        doc2vec_model.delete_temporary_training_data(
            keep_doctags_vectors=True, keep_inference=True)
    except Exception as e:
        raise RuntimeError('Bad data set, unable to create Doc2Vec model.') from e
    return doc2vec_model

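# Usage sketch for train_doc2vec_model; `transformer` is a hypothetical object carrying
# the hyperparameters read from self (vector_size, window, dm, min_count), and `texts`
# is any iterable of strings.
model = transformer.train_doc2vec_model(texts)
# Infer a vector for a new document using the same LexNLP tokenization.
vector = model.infer_vector(get_token_list("Sample agreement text.", stopword=True, lowercase=True))
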
def normalize_text(text: str,
                   spaces_on_start_end: bool = True,
                   spaces_after_dots: bool = True,
                   lowercase: bool = True,
                   use_stemmer: bool = False,
                   simple_tokenization: bool = False) -> str:
    """
    Normalize text for substring search operations - extract tokens, join them back with
    spaces, add missing spaces after dots in abbreviations, etc.
    The overall aim of this method is to weaken the substring-matching conditions by
    normalizing both the text and the substring being searched in the same way, removing
    irrelevant differences between them (case, punctuation, ...).
    :param text:
    :param spaces_on_start_end:
    :param spaces_after_dots:
    :param lowercase:
    :param simple_tokenization: don't use nltk, just split text by space characters
    :param use_stemmer: Use stemmer instead of tokenizer. With the stemmer, every word is
    reduced to its most basic form (e.g. singular) before matching. With the tokenizer,
    words are compared as is. The tokenizer should be enough for searching for entities
    which exist as a single instance in the real world - geo entities, courts, ...
    The stemmer is required for searching for common objects - table, pen, developer, ...
    :return: normalized string
    """
    if use_stemmer:
        tokens = get_stem_list(text, lowercase=lowercase)
    elif simple_tokenization:
        tokens = reg_space.split(text)
        if lowercase:
            tokens = [t.lower() for t in tokens]
    else:
        tokens = get_token_list(text, lowercase=lowercase)

    res = ' '.join(tokens)
    if spaces_on_start_end:
        res = ' ' + res + ' '
    if spaces_after_dots:
        # collapse the double spaces introduced by inserting ' . '
        res = res.replace('.', ' . ').replace('  ', ' ')
    return res

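# Usage sketch for normalize_text: the document text and the substring being searched
# are normalized the same way before a plain containment check (example text is made up).
doc = normalize_text("The U.S. Supreme Court denied the petition.")
needle = normalize_text("U.S. Supreme Court")
# Case and punctuation differences are removed on both sides, so `needle in doc`
# behaves like a case- and punctuation-insensitive substring match.
found = needle in doc
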
def get_persons(text, strict=False, return_source=False, window=2) -> Generator:
    """
    Get person names from text.
    :param window:
    :param return_source:
    :param strict:
    :param text:
    :return:
    """
    # Iterate through sentences
    for sentence in get_sentence_list(text):
        # Tag sentence
        sentence_pos = nltk.pos_tag(get_token_list(sentence))
        companies = list(get_company_annotations(text))

        # Iterate through chunks
        persons = []
        last_person_pos = None
        for i, chunk in enumerate(nltk.ne_chunk(sentence_pos)):
            if type(chunk) == nltk.tree.Tree:
                # Check label
                if chunk.label() == 'PERSON':
                    if not strict and last_person_pos is not None and (i - last_person_pos) < window:
                        persons[-1] += " " + " ".join([c[0] for c in chunk])
                    else:
                        persons.append(" ".join([c[0] for c in chunk]))
                    last_person_pos = i
            elif not strict and last_person_pos is not None and (i - last_person_pos) < window:
                if chunk[1] in ["NNP", "NNPS"]:
                    persons[-1] += " " + chunk[0]
                    last_person_pos = i
                elif chunk[1] in ["CC"] or chunk[0] in VALID_PUNCTUATION:
                    if chunk[0].lower() in ["or"]:
                        continue
                    persons[-1] += (" " if chunk[0].lower() in ["&", "and"] else "") + chunk[0]
                    last_person_pos = i
                else:
                    last_person_pos = None

        # Cleanup and yield
        for person in persons:
            # Cleanup
            person = person.strip()
            if len(person) <= 2:
                continue
            if PERSONS_STOP_WORDS.search(person):
                continue
            person = strip_unicode_punctuation(person).strip(string.punctuation).strip(string.whitespace)
            if contains_companies(person, companies):
                continue
            if person.lower().endswith(" and"):
                person = person[0:-4]
            elif person.endswith(" &"):
                person = person[0:-2]
            if return_source:
                yield person, sentence
            else:
                yield person

def get_noun_phrases(text, strict=False, return_source=False, window=3,
                     valid_punctuation=None) -> Generator:
    """
    Get NNP phrases from text.
    :param window:
    :param return_source:
    :param strict:
    :param text:
    :return:
    """
    valid_punctuation = valid_punctuation or VALID_PUNCTUATION

    # Iterate through sentences
    for sentence in get_sentence_list(text):
        # Tag sentence
        sentence_pos = nltk.pos_tag(get_token_list(sentence))

        # Iterate through chunks
        nnps = []
        last_nnp_pos = None
        for i, chunk in enumerate(sentence_pos):
            do_join = not strict and last_nnp_pos is not None and (i - last_nnp_pos) < window
            # Check label
            if chunk[1] in ["NNP", "NNPS"]:
                if do_join:
                    sep = "" if "(" in valid_punctuation and nnps[-1][-1] == "(" else " "
                    nnps[-1] += sep + chunk[0]
                else:
                    nnps.append(chunk[0])
                last_nnp_pos = i
            elif do_join:
                if chunk[1] in ["CC"] or chunk[0] in valid_punctuation:
                    if chunk[0].lower() in ["or"]:
                        continue
                    nnps[-1] += (" " if chunk[0].lower() in ["&", "and", "("] else "") + chunk[0]
                    last_nnp_pos = i
                else:
                    last_nnp_pos = None

        # Clean up names and yield
        for nnp in nnps:
            # Cleanup
            nnp = nnp.strip()
            if len(nnp) <= 2:
                continue
            if nnp.lower().endswith(" and"):
                nnp = nnp[0:-4].strip()
            elif nnp.endswith(" &"):
                nnp = nnp[0:-2].strip()
            nnp = strip_unicode_punctuation(nnp).strip(string.punctuation).strip(string.whitespace)
            if return_source:
                yield nnp, sentence
            else:
                yield nnp

def get_geopolitical(text, strict=False, return_source=False, window=2) -> Generator:
    """
    Get GPEs from text.
    :param window:
    :param return_source:
    :param strict:
    :param text:
    :return:
    """
    # Iterate through sentences
    for sentence in get_sentence_list(text):
        # Tag sentence
        sentence_pos = nltk.pos_tag(get_token_list(sentence))

        # Iterate through chunks
        gpes = []
        last_gpe_pos = None
        for i, chunk in enumerate(nltk.ne_chunk(sentence_pos)):
            if type(chunk) == nltk.tree.Tree:
                # Check label
                if chunk.label() == 'GPE':
                    if not strict and last_gpe_pos is not None and (i - last_gpe_pos) < window:
                        gpes[-1] += " " + " ".join([c[0] for c in chunk])
                    else:
                        gpes.append(" ".join([c[0] for c in chunk]))
                    last_gpe_pos = i
            elif not strict and last_gpe_pos is not None and (i - last_gpe_pos) < window:
                if chunk[1] in ["NNP", "NNPS"]:
                    gpes[-1] += " " + chunk[0]
                    last_gpe_pos = i
                elif chunk[1] in ["CC"] or chunk[0] in VALID_PUNCTUATION:
                    if chunk[0].lower() in ["or"]:
                        continue
                    gpes[-1] += (" " if chunk[0].lower() in ["&", "and"] else "") + chunk[0]
                    last_gpe_pos = i
                else:
                    last_gpe_pos = None

        # Clean up names and yield
        for gpe in gpes:
            # Cleanup
            gpe = gpe.strip()
            if len(gpe) <= 2:
                continue
            if gpe.lower().endswith(" and"):
                gpe = gpe[0:-4]
            elif gpe.endswith(" &"):
                gpe = gpe[0:-2]
            gpe = strip_unicode_punctuation(gpe).strip(string.punctuation).strip(string.whitespace)
            if return_source:
                yield gpe, sentence
            else:
                yield gpe

def get_organizations(text, strict=False, return_source=False, window=2) -> Generator:
    """
    Get organizations from text.
    :param window:
    :param return_source:
    :param strict:
    :param text:
    :return:
    """
    # Iterate through sentences
    for sentence in get_sentence_list(text):
        # Tag sentence
        sentence_pos = nltk.pos_tag(get_token_list(sentence))

        # Iterate through chunks
        organizations = []
        last_org_pos = None
        for i, chunk in enumerate(nltk.ne_chunk(sentence_pos)):
            if type(chunk) == nltk.tree.Tree:
                # Check label
                if chunk.label() in ['ORGANIZATION']:
                    if not strict and last_org_pos is not None and (i - last_org_pos) < window:
                        organizations[-1] += " " + " ".join([c[0] for c in chunk])
                    else:
                        organizations.append(" ".join([c[0] for c in chunk]))
                    last_org_pos = i
            elif not strict and last_org_pos is not None and (i - last_org_pos) < window:
                if chunk[1] in ["NNP", "NNPS"]:
                    organizations[-1] += " " + chunk[0]
                    last_org_pos = i
                elif chunk[1] in ["CC"] or chunk[0] in VALID_PUNCTUATION:
                    if chunk[0].lower() in ["or"]:
                        continue
                    organizations[-1] += (" " if chunk[0].lower() in ["&", "and"] else "") + chunk[0]
                    last_org_pos = i
                else:
                    last_org_pos = None

        for org in organizations:
            # Cleanup
            org = org.strip()
            if len(org) <= 2:
                continue
            if org.lower().endswith(" and"):
                org = org[0:-4]
            elif org.endswith(" &"):
                org = org[0:-2]
            org = strip_unicode_punctuation(org).strip(string.punctuation).strip(string.whitespace)
            if return_source:
                yield org, sentence
            else:
                yield org

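# Usage sketch exercising the NLTK-based extractors defined above (get_persons,
# get_noun_phrases, get_geopolitical, get_organizations) on one invented sentence;
# the helpers they reference (nltk, get_sentence_list, get_token_list,
# VALID_PUNCTUATION, company detection) are assumed to be importable as in the snippets.
sample = "John Smith, president of Acme Corporation, met regulators in New York."
people = list(get_persons(sample))           # e.g. "John Smith"
orgs = list(get_organizations(sample))       # e.g. "Acme Corporation"
places = list(get_geopolitical(sample))      # e.g. "New York"
phrases = list(get_noun_phrases(sample, window=3))
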
# (tail of a truncated DataFrame-loading call that creates the `df` used below)
header='infer')

dir = "/Users/clavance/Desktop/Dropbox/Individual_project/EURLEX/html_clean/"
second_dir = "/Users/clavance/Desktop/Dropbox/Individual_project/EURLEX/html_tokenised/"
directory = os.fsencode(dir)

for file in os.listdir(directory):
    filename = os.fsdecode(file)
    celex = filename.split(".txt", 1)[0]
    print(celex)

    f = open(dir + filename, "r", encoding='latin1').read()
    h = f.split("\nTitle: ", 1)
    title = h[1].split("\nText: ", 1)[0]
    print(title)
    text = h[1].split("\nText: ", 1)[1]

    # Tokenise with LexNLP, removing stopwords
    tokens = ln.get_token_list(text, stopword=True)
    tokenstring = " ".join(tokens)

    for i in range(len(df)):
        if df.loc[i, 'CelexID'] == celex:
            if df.loc[i + 3, 'CelexID'] == celex:
                classification = df.loc[i, 'Classes']
                classification2 = df.loc[i + 1, 'Classes']
                classification3 = df.loc[i + 2, 'Classes']
                classification4 = df.loc[i + 3, 'Classes']
                docid = df.loc[i, 'DocID']

                with open(second_dir + str(docid) + '.txt', "w", encoding='latin1') as newfile:

def process_document(cls, document_text: str):
    # Keep only purely alphabetic tokens, lowercased and with stopwords removed.
    return [
        t for t in get_token_list(document_text, stopword=True, lowercase=True)
        if t.isalpha()
    ]

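# Usage sketch for process_document; `DocumentProcessor` is a hypothetical owning class
# exposing it as a classmethod, and the sample text is invented.
tokens = DocumentProcessor.process_document("The Parties agree to 30 days' notice.")
# -> alphabetic, lowercased, non-stopword tokens, e.g. ['parties', 'agree', 'days', 'notice']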