def test_phrase_matcher_sent_start(en_vocab, attr):
    """Constructing a PhraseMatcher with the given attr must not raise."""
    PhraseMatcher(en_vocab, attr=attr)
# Tokenizing: dump each token of the module-level `doc`.
for token in doc:
    print("tokens: ", token)

# Text preprocessing: print a token/lemma/stopword table.
# (fix: the original wrapped this header in an f-string and chained a
# no-op .format(...) call on it — the string has no placeholders, so the
# call did nothing; a plain literal prints the identical header)
print("Token \t\tLemma \t\tStopword")
print("-" * 40)
for token in doc:
    print(f"{str(token)}\t\t{token.lemma_}\t\t{token.is_stop}")

# ---------------------------------------
# Pattern Matching: case-insensitive phrase matching over product names.
from spacy.matcher import PhraseMatcher

matcher = PhraseMatcher(nlp.vocab, attr='LOWER')
terms = ['Galaxy Note', 'iPhone 11', 'iPhone XS', 'Google Pixel']
# Patterns must be Doc objects; run each term through the pipeline.
patterns = [nlp(text) for text in terms]
matcher.add("TerminologyList", None, *patterns)
print("i) ", patterns)
text_doc = nlp(
    "Glowing review overall, and some really interesting side-by-side "
    "photography tests pitting the iPhone 11 Pro against the "
    "Galaxy Note 10 Plus and last year’s iPhone XS and Google Pixel 3.")
matches = matcher(text_doc)
print(matches)
# Each match is a (match_id, start, end) triple.
match_id, start, end = matches[0]
def deidentifier_func(input_string, nlp_trained_model, nlp_blank_model, choice):
    """De-identify PII in ``input_string`` by masking matches with 'X' runs.

    Runs a sequence of regex extractions (via the external ``extract_regex``
    helper) plus spaCy NER/PhraseMatcher passes over progressively-masked
    copies of the text, covering dates, Aadhaar/SSN, emails, IPs, URLs,
    licence plates, phone/fax numbers, PAN/passport/account/credit-card
    numbers, MRNs, addresses, and ages.

    Parameters:
        input_string: raw text to de-identify.
        nlp_trained_model: spaCy pipeline with NER (used for extraction).
        nlp_blank_model: blank spaCy pipeline (used for age phrase matching).
        choice: 1 masks dates with 'X'; 2 shifts dates by a random offset.

    Returns:
        (masked_string, info_dict, shift) where ``shift`` is ``None`` for
        choice 1 and the random day-offset for choice 2.
    """
    #doc = nlp_trained_model((open(input_string)).read())
    doc = nlp_trained_model(input_string)
    #original_string = open((input_string)).read()
    original_string = input_string
    # ** Calling extract_regex function to get list of all the matched regex pattern **
    date_list = extract_regex(
        r"\D([0-9]{4}|[0-9]{1,2})(\/|-)[0-9]{1,2}(\/|-)([0-9]{1,2}|[0-9]{4})\D",
        doc, original_string)
    # The pattern anchors on a non-digit on each side; trim one char off both
    # ends so each entry holds only the date text itself.
    for i in range(len(date_list)):
        date_list[i][1] = date_list[i][1] + 1
        date_list[i][2] = date_list[i][2] - 1
        date_list[i][0] = original_string[date_list[i][1]:date_list[i][2]]
    # ** For choice 1 **
    # NOTE(review): the block below is dead code kept as a string literal.
    """if(choice == 1):
        for a in date_list:
            count = 0
            for i in range(a[1], a[1] + 4):
                if(original_string[i].isnumeric()):
                    count = count + 1
            if(count == 4):
                original_string=original_string[:a[1]+4]+''*(a[2]-a[1]-4)+original_string[a[2]:]
            else:
                count = 0
                for j in range(a[2], a[2]-5, -1):
                    if(original_string[j].isnumeric()):
                        count = count + 1
                if(count == 4):
                    original_string=original_string[:a[1]]+''*(a[2]-a[1]-4)+original_string[a[2]-4:]
                elif(count == 3):
                    original_string=original_string[:a[1]]+''*(a[2]-a[1]-2)+original_string[a[2]-2:]
                else:
                    original_string=original_string[:a[1]]+''*(a[2]-a[1])+original_string[a[2]:]
    """
    # ** For Choice 2 **
    date_shift = []
    temp_1 = 0
    temp_2 = 0  # running offset: replacements can change the string length
    random_value = randint(0, 90)
    if (choice == 2):
        for temp in range(len(date_list)):
            temp_list = []
            text = date_list[temp][0]
            start = date_list[temp][1] + temp_2
            end = date_list[temp][2] + temp_2
            # Converting dates to pandas datetime so as to use timedelta function
            pandas_date = pd.to_datetime(text, infer_datetime_format=True,
                                         errors='ignore')
            # errors='ignore' returns the input string unchanged on failure,
            # so a non-str result means parsing succeeded.
            if (type(pandas_date) != str):
                pandas_date = pandas_date + timedelta(days=random_value)
                # str(Timestamp) ends in " HH:MM:SS"; [:-9] keeps the date part.
                original_string = original_string[:start] + str(
                    pandas_date)[:-9] + original_string[end:]
                temp_2 = temp_2 + (len(str(pandas_date)[:-9]) - len(text))
                temp_list.append(str(pandas_date)[:-9])
                temp_list.append(start)
                temp_list.append(start + len(str(pandas_date)[:-9]))
                date_shift.append(temp_list)
    # ** Extracting all various identifiers using regex pattern **
    #dob_list = extract_regex(r"^(0[1-9]|1[012])[-/.](0[1-9]|[12][0-9]|3[01])[-/.](19|20)\\d\\d$",
    #                         doc, original_string)
    aadhar_list = extract_regex(r"(\d{4}(\s|\-)\d{4}(\s|\-)\d{4})", doc,
                                original_string)
    ssn_list = extract_regex(r"^\d{9}$", doc, original_string)
    mail_list = extract_regex(
        r"(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*)@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])",
        doc, original_string)
    ip_list = extract_regex(
        r"((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)",
        doc, original_string)
    # ** Now de-identifying them **
    #for a in dob_list:
    #    original_string = original_string[:a[1]]+'X'*(a[2]-a[1])+original_string[a[2]:]
    for a in aadhar_list:
        original_string = original_string[:a[1]] + 'X' * (
            a[2] - a[1]) + original_string[a[2]:]
    for a in ssn_list:
        original_string = original_string[:a[1]] + 'X' * (
            a[2] - a[1]) + original_string[a[2]:]
    for a in mail_list:
        original_string = original_string[:a[1]] + 'X' * (
            a[2] - a[1]) + original_string[a[2]:]
    for a in ip_list:
        original_string = original_string[:a[1]] + 'X' * (
            a[2] - a[1]) + original_string[a[2]:]
    # ** Now to extract urls and licence plate numbers from last updated original_string
    # and then deidentifying them too **
    doc = nlp_trained_model(original_string)
    url_list = extract_regex(
        r"(http:\/\/www\.|https:\/\/www\.|http:\/\/|https:\/\/)?[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(:[0-9]{1,5})?(\/.*)?",
        doc, original_string)
    license_plate_list = extract_regex(
        r"[A-Z]{2}[ -][0-9]{1,2}(?: [A-Z])?(?: [A-Z]*)? [0-9]{4}", doc,
        original_string)
    # NOTE(review): both loops below iterate ip_list (already masked above);
    # presumably url_list and license_plate_list were intended — confirm.
    for a in ip_list:
        original_string = original_string[:a[1]] + 'X' * (
            a[2] - a[1]) + original_string[a[2]:]
    for a in ip_list:
        original_string = original_string[:a[1]] + 'X' * (
            a[2] - a[1]) + original_string[a[2]:]
    # ** Now to extract contact details i.e phone numbers and fax numbers from last updated
    # original_string and then deidentifying them too **
    doc = nlp_trained_model(original_string)
    #indian_ph_no = extract_regex(r"((\+*)((0[ -]+)*|(91 )*)(\d{12}+|\d{10}+))|\d{5}([- ]*)\d{6}",
    #                             doc, original_string)
    usa_ph_no = extract_regex(r"^(\([0-9]{3}\) |[0-9]{3}-)[0-9]{3}-[0-9]{4}$",
                              doc, original_string)
    phone_fax_list1 = extract_regex(
        r"(?:(?:(?:(\+)((?:[\s.,-]*[0-9]*)*)(?:\()?\s?((?:[\s.,-]*[0-9]*)+)(?:\))?)|(?:(?:\()?(\+)\s?((?:[\s.,-]*[0-9]*)+)(?:\))?))((?:[\s.,-]*[0-9]+)+))",
        doc, original_string)
    phone_fax_list2 = extract_regex(r"\D(\+91[\-\s]?)?[0]?(91)?[789]\d{9}\D",
                                    doc, original_string)
    # Same non-digit-anchor trimming as for dates above.
    for i in range(len(phone_fax_list2)):
        phone_fax_list2[i][1] = phone_fax_list2[i][1] + 1
        phone_fax_list2[i][2] = phone_fax_list2[i][2] - 1
        phone_fax_list2[i][0] = original_string[
            phone_fax_list2[i][1]:phone_fax_list2[i][2]]
    phone_fax_list = []
    for a in phone_fax_list1:
        phone_fax_list.append(a)
    for a in phone_fax_list2:
        phone_fax_list.append(a)
    for a in phone_fax_list1:
        original_string = original_string[:a[1]] + 'X' * (
            a[2] - a[1]) + original_string[a[2]:]
    for a in phone_fax_list2:
        original_string = original_string[:a[1]] + 'X' * (
            a[2] - a[1]) + original_string[a[2]:]
    #for a in indian_ph_no:
    #    original_string = original_string[:a[1]]+'X'*(a[2]-a[1])+original_string[a[2]:]
    for a in usa_ph_no:
        original_string = original_string[:a[1]] + 'X' * (
            a[2] - a[1]) + original_string[a[2]:]
    # ** Extracting account details and other identification details and deidentifying them**
    doc = nlp_trained_model(original_string)
    pan_list = extract_regex(r"[A-Z]{5}\d{4}[A-Z]{1}", doc, original_string)
    passport_list = extract_regex(r"[A-Z]{1}\d{7}", doc, original_string)
    account_and_serial_list = extract_regex(r"\d{9,18}", doc, original_string)
    credit_card_list = extract_regex(
        r"\d{5}(\s|\-)\d{5}(\s|\-)\d{5}|\d{4}(\s|\-)\d{4}(\s|\-)\d{4}(\s|\-)\d{4}",
        doc, original_string)
    for a in pan_list:
        original_string = original_string[:a[1]] + 'X' * (
            a[2] - a[1]) + original_string[a[2]:]
    for a in passport_list:
        original_string = original_string[:a[1]] + 'X' * (
            a[2] - a[1]) + original_string[a[2]:]
    for a in account_and_serial_list:
        original_string = original_string[:a[1]] + 'X' * (
            a[2] - a[1]) + original_string[a[2]:]
    for a in credit_card_list:
        original_string = original_string[:a[1]] + 'X' * (
            a[2] - a[1]) + original_string[a[2]:]
    # ** Extracting MRN(Medical Report Number) if present and assumning it to be 7 digit**
    doc = nlp_trained_model(original_string)
    mrn_list = extract_regex(r"\d{7}", doc, original_string)
    for a in mrn_list:
        original_string = original_string[:a[1]] + 'X' * (
            a[2] - a[1]) + original_string[a[2]:]
    # Now we've deidentified all the details except address
    # ** For extracting address we use a list of address_identifiers for addresses smaller
    # than street names and match them with every element in spacy doc object.
    # Matched object are then added to our address_list **
    address_identifier = [
        'st', 'niwas', 'aawas', 'palace', 'road', 'block', 'gali', 'sector',
        'flr', 'floor', 'path', 'near', 'oppo', 'bazar', 'house', 'nagar',
        'bypass', 'bhawan', 'street', 'rd', 'sq', 'flat', 'lane', 'gali',
        'circle', 'bldg', 'ave', 'mandal', 'avenue', 'tower', 'nagar', 'marg',
        'chowraha', 'lane', 'heights', 'plaza', 'park', 'garden', 'gate',
        'villa', 'market', 'apartment', 'chowk'
    ]
    doc = nlp_trained_model(original_string)
    address_list = []
    for i in doc:
        if (len(i) > 1 and '\n' not in str(i)):
            if (str(i).lower() in address_identifier):
                address_list.append(i)
    # ** Now to remove the identified addresses after getting their position in og_string
    address_index = []
    temp_2 = 0  # reused as a search cursor into original_string
    length = len(original_string)
    for i in address_list:
        # Advance past matches embedded inside larger words until a
        # stand-alone occurrence (non-alpha on both sides) is found.
        while (1):
            index = original_string.find(str(i), temp_2, length)
            if (index == -1):
                break
            if (index != 0 and index != length):
                if ((original_string[index - 1].isalpha()
                     or original_string[index + len(str(i))].isalpha())):
                    temp_2 = index + len(str(i))
                else:
                    break
        # NOTE(review): if find() exhausted the string, index is -1 here and
        # still gets recorded — confirm this is intended.
        address_index.append(index)
        temp_2 = index + len(str(i))
    temp_1 = 0
    new_address_list = []
    # Cluster indexes less than 20 chars apart into one address candidate.
    if (address_index != []):
        temp_1 = address_index[0]
        a = []
        for b in address_index:
            if (b - temp_1 < 20):
                a.append(b)
                temp_1 = b
            else:
                new_address_list.append(a)
                a = []
                a.append(b)
                temp_1 = b
        new_address_list.append(a)
    # ** Removing the complete word in which the addres_identifier was used **
    addr_list = []
    for a in new_address_list:
        flag = []
        # Scan left from the first hit to the previous delimiter...
        j = a[0]
        while (j != -1 and original_string[j] not in [',', '\n', '.', ';']):
            j = j - 1
        startt = j
        index_1 = startt
        # ...then up to 8 more chars left, extending over leading digits
        # (house numbers and the like).
        count = 8
        while (count and j != -1 and original_string[j] != '\n'):
            if (original_string[j].isdigit()):
                startt = j
            j = j - 1
            count = count - 1
        # Scan right from the last hit symmetrically.
        j = a[-1]
        #print(j)
        while (j != -1 and original_string[j] not in [',', '\n', '.', ';']):
            j = j + 1
        endd = j
        index_2 = endd
        count = 7
        while (count and j != length and original_string[j] != '\n'):
            if (original_string[j].isdigit()):
                endd = j
            j = j + 1
            count = count - 1
        # Keep only plausible spans (< 50 chars, not sentence-period bounded).
        if ((original_string[index_1] != '.'
             or original_string[index_2] != '.') and (index_2 - index_1) < 50):
            if (original_string[startt] == '\n'):
                startt = startt + 1
            if (original_string[endd] == '\n'):
                endd = endd - 1
            flag.append(original_string[startt:endd + 1])
            flag.append(startt)
            flag.append(endd)
            addr_list.append(flag)
    for a in addr_list:
        original_string = original_string[:a[1]] + 'X' * (
            a[2] - a[1]) + original_string[a[2]:]
    # ** After deidentifying all these details we are now left with only names, dates, age
    # which cannot be identified by regular expression **
    # To extract dates we use spacy's pre-trained en_core_web_sm model along with
    # some modifications to the default model according to our requirements
    time_identifier = [
        'YEAR', 'YEARS', 'AGE', 'AGES', 'MONTH', 'MONTHS', 'DECADE', 'CENTURY',
        'WEEK', 'DAILY', 'DAY', 'DAYS', 'NIGHT', 'NIGHTS', 'WEEKLY', 'MONTHLY',
        'YEARLY'
    ]
    doc_1 = nlp_trained_model(original_string)
    new_date_list = []
    for entities in doc_1.ents:
        # Skip entities already mostly masked ('X' runs from earlier passes).
        if (str(entities.text).count('X') < 2):
            date = []
            # Accept DATE entities > 4 chars containing at least one digit and
            # one letter, with none of the duration words above in them.
            if (entities.label_ == 'DATE' and (sum([
                    True if i not in original_string[entities.start_char:
                                                     entities.end_char].upper()
                    else False for i in time_identifier
            ]) == len(time_identifier))
                    and (entities.end_char - entities.start_char) > 4
                    and sum(c.isdigit()
                            for c in original_string[entities.start_char:
                                                     entities.end_char]) >= 1
                    and sum(c.isalpha()
                            for c in original_string[entities.start_char:
                                                     entities.end_char]) >= 1):
                date.append(entities.text)
                date.append(entities.start_char)
                date.append(entities.end_char)
                new_date_list.append(date)
    # Mask each date but keep a 4-digit year visible when one is present at
    # either end of the span.
    for a in new_date_list:
        count = 0
        for i in range(a[1], a[1] + 4):
            if (original_string[i].isnumeric()):
                count = count + 1
        if (count == 4):
            original_string = original_string[:a[1] + 4] + 'X' * (
                a[2] - a[1] - 4) + original_string[a[2]:]
        else:
            count = 0
            for j in range(a[2], a[2] - 5, -1):
                if (original_string[j].isnumeric()):
                    count = count + 1
            if (count == 4):
                original_string = original_string[:a[1]] + 'X' * (
                    a[2] - a[1] - 4) + original_string[a[2] - 4:]
            elif (count == 3):
                original_string = original_string[:a[1]] + 'X' * (
                    a[2] - a[1] - 2) + original_string[a[2] - 2:]
            else:
                original_string = original_string[:a[1]] + 'X' * (
                    a[2] - a[1]) + original_string[a[2]:]
    final_date_list = []
    # NOTE(review): both loops append new_date_list; for choice 1 every date
    # is therefore added twice — confirm whether the second loop should use
    # the regex-derived date_list instead.
    if (choice == 1):
        for a in new_date_list:
            final_date_list.append(a)
    for a in new_date_list:
        final_date_list.append(a)
    # final_date_list contains all the dates we extracted including regex and spacy model
    # ** Now going for age part, we use the spacy's phrasematcher
    # which takes input as patterns we want to match and
    # outputs the start and end index of matched pattern **
    try:
        age_list = []
        matcher = PhraseMatcher(nlp_trained_model.vocab, attr="SHAPE")
        age_identifier = [
            'YEAR', 'YEARS', 'Y/O', 'AGES', 'AGE', 'Y.O', 'Y.O.', 'AGED',
            'AGE IS'
        ]
        # NOTE(review): `nlp3` below is not defined anywhere in this file;
        # the resulting NameError is swallowed by the bare except, silently
        # disabling the whole age pass — probably meant nlp_blank_model.
        matcher.add("age", None, nlp_blank_model("76 year old"),
                    nlp_blank_model("aged 58"), nlp_blank_model('aged 123'),
                    nlp_blank_model("54 y/o"), nlp_blank_model("age is 59"),
                    nlp_blank_model("123 y/o"), nlp_blank_model("ages 35"),
                    nlp_blank_model("age 45"), nlp_blank_model("ages 123"),
                    nlp_blank_model("age 123"), nlp_blank_model("54 years old"),
                    nlp_blank_model("124 years old"), nlp3("41 y.o."),
                    nlp_blank_model("123 y.o."), nlp_blank_model('113 year old'))
        doc = nlp_blank_model(original_string)
        for match_id, start, end in matcher(doc):
            if (sum([
                    True if i in str(doc[start:end]).upper() else False
                    for i in age_identifier
            ]) >= 1):
                a = []
                for i in range(start, end):
                    if (str(doc[i:i + 1]).isnumeric()):
                        # Only ages > 89 are masked (HIPAA-style threshold
                        # — TODO confirm intent).
                        if (int(str(doc[i:i + 1])) > 89):
                            # NOTE(review): `st` is undefined and
                            # `result.len(...)` is not valid on an int
                            # (likely meant original_string.find and
                            # result + len(...)); any failure here is
                            # swallowed by the bare except below.
                            result = st.find(str(doc[start:end]))
                            count = 0
                            for j in range(result,
                                           result.len(str(doc[start:end]))):
                                if (original_string[j:j + 1].isnumeric()
                                        and count == 0):
                                    sstart = j
                                if (original_string[j:j + 1].isnumeric()):
                                    count = count + 1
                            a.append(original_string[sstart:sstart + count])
                            a.append(sstart)
                            a.append(sstart + count)
                            age_list.append(a)
                            original_string = original_string[:
                                                              sstart] + 'X' * count + original_string[
                                                                  sstart +
                                                                  count:]
    except:
        None
    # ** Last step is packing all the extracted pattern in a dict
    info_dict = {}
    info_dict['date'] = final_date_list
    #info_dict['dob'] = dob_list
    info_dict['aadhar'] = aadhar_list
    info_dict['ssn'] = ssn_list
    info_dict['mail'] = mail_list
    info_dict['ip'] = ip_list
    info_dict['url'] = url_list
    info_dict['licence_plate'] = license_plate_list
    #info_dict['indian_ph_no'] = indian_ph_no
    info_dict['usa_ph_no'] = usa_ph_no
    info_dict['phone_fax'] = phone_fax_list
    info_dict['pan'] = pan_list
    info_dict['passport'] = passport_list
    info_dict['account_details'] = account_and_serial_list
    info_dict['credit_card'] = credit_card_list
    info_dict['age'] = age_list
    info_dict['address'] = addr_list
    info_dict['medical_report_no'] = mrn_list
    info_dict['date_shift'] = date_shift
    shift = random_value
    if (choice == 1):
        return (original_string, info_dict, None)
    else:
        return (original_string, info_dict, shift)
def test_phrase_matcher_contains(en_vocab):
    """`in` on a PhraseMatcher reflects exactly the keys that were added."""
    m = PhraseMatcher(en_vocab)
    pattern = Doc(en_vocab, words=["test"])
    m.add("TEST", None, pattern)
    assert "TEST" in m
    assert "TEST2" not in m
def get_array_from_df_combined(self, df):
    """Build per-row spaCy feature arrays from ``df.text``.

    Joins all rows into one document (marking row boundaries as sentence
    starts), runs the module-level ``nlp`` pipeline once, then splits the
    resulting token-feature matrices back into per-row arrays.

    Parameters:
        df: DataFrame with a ``text`` column; one document per row.

    Returns:
        DataFrame with columns ``spacy_bin`` (numeric/gazetteer int8
        features per row) and ``spacy_cat`` (categorical string features
        per row), populated according to self.num_features /
        self.gzt_features / self.cat_features flags.

    NOTE(review): this mutates the shared ``nlp`` object (max_length and
    add_pipe) on every call — repeated calls will stack pipeline
    components; confirm callers only invoke this once per pipeline.
    """
    rows = df.text.tolist()
    rows = [
        t.replace("\n", "рдХ") for t in rows
    ]  ## as spacy cannot handle consecutive newlines
    sep = " "
    text = sep.join(rows)
    # Grow the pipeline's max_length so the concatenated doc is accepted.
    if nlp.max_length < len(text):
        nlp.max_length = 1 + len(text)
    # Cumulative token counts give the token index where each row starts.
    rows_token_indexes_in_text = list(
        np.cumsum([len(a) for a in nlp.tokenizer.pipe(rows)])
    )
    total_tokens = rows_token_indexes_in_text.pop()
    # assert(total_tokens == 1 + len(list(nlp.tokenizer.pipe([text]))[0]) ) - len(rows_token_indexes_in_text)

    def set_custom_boundaries(doc):
        # Force a sentence boundary at every row start.
        for token_index in rows_token_indexes_in_text:
            doc[token_index].is_sent_start = True
        return doc

    nlp.add_pipe(set_custom_boundaries, before="tagger")
    doc = nlp(text)
    result_df = pd.DataFrame([], columns=["spacy_bin", "spacy_cat"])
    ## bigint features
    if self.cat_features:
        spacy_bigint_attributes = (
            self.spacy_vorn_attributes + self.spacy_vocab_attributes
        )
        tokens_features_bigint = doc.to_array(
            spacy_bigint_attributes
        ).astype("object")
        # Resolve hash IDs back to their string values via the vocab.
        for i in range(tokens_features_bigint.shape[0]):
            for j in range(tokens_features_bigint.shape[1]):
                tokens_features_bigint[i][j] = nlp.vocab[
                    tokens_features_bigint[i][j]
                ].text
        tokens_features_big = np.split(
            tokens_features_bigint, rows_token_indexes_in_text
        )
        result_df["spacy_cat"] = tokens_features_big
    small_feat_list = []
    ## smallint features
    if self.num_features:
        tokens_features_smallint = doc.to_array(
            self.spacy_num_attributes
        ).astype("int8")
        small_feat_list.append(tokens_features_smallint)
    ## gzt features
    if self.gzt_features:
        phrase_matcher = PhraseMatcher(nlp.vocab)
        gzt_attributes = [a.upper() for a in list(self.GZT_LISTS.keys())]
        # Map each label's string-store hash to its feature column.
        gzt_index_map = dict()
        for i, a in enumerate(gzt_attributes):
            gzt_index_map[nlp.vocab.strings[a]] = i
        gzt_patterns = list()
        for label, terms in self.GZT_LISTS.items():
            patterns = [nlp.make_doc(text) for text in terms]
            phrase_matcher.add(label.upper(), None, *patterns)
        gzt_matches = phrase_matcher(doc)
        # One 0/1 column per gazetteer list; 1 marks a token inside a match.
        token_gzt_features = np.zeros(
            shape=[len(doc), len(gzt_attributes)], dtype="int8"
        )
        for match_id, start, end in gzt_matches:
            gzt_attribute_index = gzt_index_map[match_id]
            span = doc[start:end]
            if span is not None:
                for token in span:
                    # print(token.i, token)
                    token_gzt_features[token.i, gzt_attribute_index] = 1
        small_feat_list.append(token_gzt_features)
        # tokens_features_small = np.concatenate((tokens_features_smallint, token_gzt_features), axis=1)
    if len(small_feat_list) > 0:
        tokens_features_small = np.hstack(small_feat_list)
        tokens_features_small = np.split(
            tokens_features_small, rows_token_indexes_in_text
        )
        result_df["spacy_bin"] = tokens_features_small
    return result_df
def build_phrase_matcher(name: str, phrases: List[str]) -> PhraseMatcher:
    """Builds a PhraseMatcher object.

    Args:
        name: key under which the patterns are registered.
        phrases: literal phrases to match.

    Returns:
        A PhraseMatcher over the module-level ``nlp`` pipeline's vocab.
    """
    matcher = PhraseMatcher(nlp.tokenizer.vocab)
    # Fix: PhraseMatcher.add expects Doc patterns, not raw strings —
    # passing the strings directly raises a type error at match time.
    patterns = [nlp.make_doc(phrase) for phrase in phrases]
    matcher.add(name, patterns)
    return matcher
def create_profile(text):
    """Count keyword matches per subject in a candidate's text.

    Parameters:
        text: a list whose second element is the candidate name and whose
            remaining elements hold the candidate's text.

    Returns:
        A one-row DataFrame with the candidate name, the subject with the
        most keyword hits, and that hit count.
    """
    name = text[1]
    # NOTE(review): after the first remove() the indices shift, so the
    # second call removes what was originally text[2] — and the element
    # taken as `name` is still present in the text that gets matched.
    # Confirm this is intended.
    text.remove(text[0])
    text.remove(text[1])
    text = str(text)
    text = text.lower()
    #below is the csv where we have all the keywords, you can customize your own
    loggingStart("CLASSIFYING KEYWORDS")
    keyword_dict = pd.read_csv('data.csv', encoding="ISO-8859-1")
    loggingEnd("CLASSIFYING KEYWORDS")
    loggingStart("CLASSIFYING ML BASED WORDS")
    ML_words = [
        nlp(text) for text in keyword_dict['Machine Learning'].dropna(axis=0)
    ]
    loggingEnd("CLASSIFYING ML BASED WORDS")
    loggingStart("CLASSIFYING DL BASED WORDS")
    DL_words = [
        nlp(text) for text in keyword_dict['Deep Learning'].dropna(axis=0)
    ]
    loggingEnd("CLASSIFYING DL BASED WORDS")
    loggingStart("CLASSIFYING PYTHON BASED WORDS")
    python_words = [
        nlp(text) for text in keyword_dict['Python Language'].dropna(axis=0)
    ]
    loggingEnd("CLASSIFYING PYTHON BASED WORDS")
    loggingStart("CLASSIFYING WEB BASED WORDS")
    web_words = [nlp(text) for text in keyword_dict['Web'].dropna(axis=0)]
    loggingEnd("CLASSIFYING WEB BASED WORDS")
    loggingStart("CLASSIFYING CYBER SECURITY BASED WORDS")
    security_words = [
        nlp(text) for text in keyword_dict['Cyber security'].dropna(axis=0)
    ]
    loggingEnd("CLASSIFYING CYBER SECURITY WORDS")
    loggingStart("MATCHING")
    matcher = PhraseMatcher(nlp.vocab)
    matcher.add('ML', None, *ML_words)
    matcher.add('DL', None, *DL_words)
    matcher.add('Web', None, *web_words)
    matcher.add('Python', None, *python_words)
    matcher.add('CS', None, *security_words)
    doc = nlp(text)
    d = []
    matches = matcher(doc)
    for match_id, start, end in matches:
        rule_id = nlp.vocab.strings[
            match_id]  # get the unicode ID, i.e. 'COLOR'
        span = doc[start:end]  # get the matched slice of the doc
        d.append((rule_id, span.text))
    loggingEnd("MATCHING")
    # One "<subject> <keyword> (<count>)" line per distinct match.
    keywords = "\n".join(f'{i[0]} {i[1]} ({j})' for i, j in Counter(d).items())
    #print(str(keywords))
    ## converting string of keywords to dataframe
    loggingStart("CONVERT TO DATAFRAMES")
    df = pd.read_csv(StringIO(keywords), names=['Keywords_List'])
    df1 = pd.DataFrame(df.Keywords_List.str.split(' ', 1).tolist(),
                       columns=['Subject', 'Keyword'])
    df2 = pd.DataFrame(df1.Keyword.str.split('(', 1).tolist(),
                       columns=['Keyword', 'Count'])
    df3 = pd.concat([df1['Subject'], df2['Keyword'], df2['Count']], axis=1)
    df3['Count'] = df3['Count'].apply(lambda x: x.rstrip(")"))
    label = list(df3['Subject'])
    loggingEnd("CONVERT TO DATAFRAMES")
    # Pick the subject with the highest occurrence count.
    # NOTE(review): the dict is keyed by count, so two subjects with equal
    # counts overwrite each other — confirm the tie behaviour is acceptable.
    count = {}
    x = set(label)
    for i in x:
        y = label.count(i)
        count.update({y: i})
    final = max(list(count.keys()))
    data = {
        'Candidate Name': name,
        'Subject': [count[final]],
        'Count': [final]
    }
    df4 = pd.DataFrame(data)
    return (df4)
def get_validator_matches(text):
    """Run requirement-quality validators over the module-level input text.

    Registers token-pattern rules (adverbs, adjectives, passive voice,
    infinitives, pronouns) on a Matcher and phrase rules (vague terms,
    escape clauses, etc.) on a lemma-keyed PhraseMatcher; the per-rule
    callbacks populate the module-level ``match_ents`` list.

    NOTE(review): the ``text`` parameter is unused — matching runs against
    the module-level ``inputText``; confirm whether ``text`` should be used.

    Returns:
        ``match_ents``, sorted by each entry's "start" offset.
    """
    match_ents.clear()
    matcher = Matcher(nlp.vocab)
    phraseMatcher = PhraseMatcher(nlp.vocab, attr="LEMMA")
    adverbPattern = [{"POS": "ADV"}]
    matcher.add("Adverbs", match_adverb, adverbPattern)
    adjectivePattern = [{"POS": "ADJ"}]
    matcher.add("Adjectives", match_adjective, adjectivePattern)
    pastTenseVerbPattern1 = [{"TAG": "VBD"}]
    pastTenseVerbPattern2 = [{"TAG": "VBN"}]
    matcher.add("Passive Voice", match_passive, pastTenseVerbPattern1,
                pastTenseVerbPattern2)
    infinitivePattern1 = [{"LOWER": "be"}, {"POS": "ADJ"}, {"POS": "ADP"}]
    infinitivePattern2 = [{"LOWER": "to"}, {"POS": "VERB"}]
    matcher.add("Infinitive", match_infinitive, infinitivePattern1,
                infinitivePattern2)
    pronounPattern = [{"POS": "PRON"}]
    matcher.add("Pronoun", match_pronoun, pronounPattern)
    indefiniteArticles = ["a", "an"]
    indefiniteArticlePatterns = [nlp(text) for text in indefiniteArticles]
    phraseMatcher.add("Indefinite Articles", match_indefinite_articles,
                      *indefiniteArticlePatterns)
    vagueTerms = [
        "some", "any", "allowable", "several", "many", "lot of", "a few",
        "almost always", "very nearly", "nearly", "about", "close to",
        "almost", "approximate"
    ]
    vagueTermsPatterns = [nlp(text) for text in vagueTerms]
    phraseMatcher.add("Vague Terms", match_vague_terms, *vagueTermsPatterns)
    escapeClauses = [
        "so far as is possible", "as possible", "as little as possible",
        "where possible", "as much as possible",
        "if it should prove necessary", "if necessary",
        "to the extent necessary", "as appropriate", "as required",
        "to the extent practical", "if practicable"
    ]
    escapeClausesPatterns = [nlp(text) for text in escapeClauses]
    phraseMatcher.add("Escape Clauses", match_escape_clauses,
                      *escapeClausesPatterns)
    # Fix: "limitedd" was misspelled, so the phrase could never match the
    # real wording "including but not limited to" in input documents.
    openEndedClauses = ["including but not limited to", "etc", "and so on"]
    openEndedPatterns = [nlp(text) for text in openEndedClauses]
    phraseMatcher.add("Open Ended Clauses", match_open_ended_clauses,
                      *openEndedPatterns)
    notTerms = ["not"]
    notPatterns = [nlp(text) for text in notTerms]
    phraseMatcher.add("Negations", match_negations, *notPatterns)
    universalQuantifiers = [
        "all", "any", "both", "completely", "prompt", "fast", "minimum",
        "maximum", "optimum"
    ]
    universalPatterns = [nlp(text) for text in universalQuantifiers]
    phraseMatcher.add("Immeasurable Quantifiers", match_universal_quantifier,
                      *universalPatterns)
    temporalDependencies = [
        "eventually", "before", "when", "after", "as", "once", "earliest",
        "latest", "instantaneous", "simultaneous", "while", "at last"
    ]
    temporalPatterns = [nlp(text) for text in temporalDependencies]
    phraseMatcher.add("Temporal Dependencies", match_temporal,
                      *temporalPatterns)
    # Invoking the matchers fires the on-match callbacks, which append into
    # match_ents; the returned match lists themselves are not used.
    doc = nlp(inputText)
    matches = matcher(doc)
    lowercaseDoc = nlp(inputText.lower())
    phraseMatches = phraseMatcher(lowercaseDoc)
    match_ents.sort(key=lambda x: x["start"])
    return match_ents
def __init__(self, nlp, terms, label):
    """Keep the raw term list and register it on a PhraseMatcher under *label*."""
    self.term_list = terms
    self.matcher = PhraseMatcher(nlp.vocab)
    term_docs = [nlp(term) for term in terms]
    self.matcher.add(label, None, *term_docs)
def __init__(self, nlp, terms):
    """Build a "TerminologyList" matcher and expose it as a Doc extension."""
    self.terms = terms
    self.matcher = PhraseMatcher(nlp.vocab)
    docs = [nlp.make_doc(t) for t in terms]
    self.matcher.add("TerminologyList", None, *docs)
    # The matcher itself is callable on a Doc, so it can serve directly as
    # the getter for Doc._.phrase_matches.
    Doc.set_extension("phrase_matches", getter=self.matcher, force=True)
def create_web_dev_profile(file):
    """Extract web-development keyword counts from a resume PDF.

    Parameters:
        file: path to the resume PDF; the candidate name is taken from the
            filename (text before the first underscore).

    Returns:
        DataFrame with candidate name, subject, keyword and count columns.

    NOTE(review): mutates the module-level ``total_sum`` global on every
    call.
    """
    text = pdfextract(file)
    text = str(text)
    text = text.replace("\\n", "")
    text = text.lower()
    #below is the csv where we have all the keywords, you can customize your own
    keyword_dict = pd.read_csv('web_developer_keywords.csv')
    keyword_total = list(keyword_dict.count())
    global total_sum
    total_sum = 0
    for i in keyword_total:
        total_sum = total_sum + i
    print('ee',total_sum)
    # One pattern list per keyword category column.
    front_end = [nlp(text) for text in keyword_dict['Front End'].dropna(axis = 0)]
    back_end = [nlp(text) for text in keyword_dict['Back End'].dropna(axis = 0)]
    database = [nlp(text) for text in keyword_dict['Database'].dropna(axis = 0)]
    project = [nlp(text) for text in keyword_dict['Projects'].dropna(axis = 0)]
    frameworks = [nlp(text) for text in keyword_dict['Frameworks'].dropna(axis = 0)]
    #print(front_end)
    # print(back_end)
    #print(database)
    matcher = PhraseMatcher(nlp.vocab)
    matcher.add('FrontEnd', None, *front_end)
    matcher.add('BackEnd', None, *back_end)
    matcher.add('Database', None, *database)
    matcher.add('Projects', None, *project)
    matcher.add('Frameworks', None, *frameworks)
    doc = nlp(text)
    #print(doc)
    d = []
    matches = matcher(doc)
    # print(matches)
    for match_id, start, end in matches:
        rule_id = nlp.vocab.strings[match_id] # get the unicode ID, i.e. 'COLOR'
        span = doc[start : end] # get the matched slice of the doc
        d.append((rule_id, span.text))
    # One "<subject> <keyword> (<count>)" line per distinct match.
    keywords = "\n".join(f'{i[0]} {i[1]} ({j})' for i,j in Counter(d).items())
    ## converting string of keywords to dataframe
    df = pd.read_csv(StringIO(keywords),names = ['Keywords_List'])
    df1 = pd.DataFrame(df.Keywords_List.str.split(' ',1).tolist(),columns = ['Subject','Keyword'])
    df2 = pd.DataFrame(df1.Keyword.str.split('(',1).tolist(),columns = ['Keyword', 'Count'])
    df3 = pd.concat([df1['Subject'],df2['Keyword'], df2['Count']], axis =1)
    df3['Count'] = df3['Count'].apply(lambda x: x.rstrip(")"))
    # Candidate name = filename text before the first underscore.
    base = os.path.basename(file)
    filename = os.path.splitext(base)[0]
    name = filename.split('_')
    name2 = name[0]
    name2 = name2.lower()
    ## converting str to dataframe
    name3 = pd.read_csv(StringIO(name2),names = ['Candidate Name'])
    dataf = pd.concat([name3['Candidate Name'], df3['Subject'], df3['Keyword'], df3['Count']], axis = 1)
    # Only the first row has the name after concat; fill it down.
    dataf['Candidate Name'].fillna(dataf['Candidate Name'].iloc[0], inplace = True)
    print(dataf)
    return(dataf)
def __get_country_matcher__(self):
    """Return a PhraseMatcher over a fixed list of country names.

    NOTE(review): the patterns are registered under the key 'ANIMAL',
    which looks copy-pasted from the animal matcher — confirm whether
    callers expect a 'COUNTRY' key instead.
    """
    country_names = ['Czech Republic', 'Australia', 'Germany', 'Slovakia']
    country_docs = list(self.nlp.pipe(country_names))
    matcher = PhraseMatcher(self.nlp.vocab)
    matcher.add('ANIMAL', None, *country_docs)
    return matcher
def __get_animal_matcher__(self):
    """Return a PhraseMatcher over singular and plural animal names."""
    animal_names = ['dog', 'cat', 'mouse', 'dogs', 'cats', 'mice']
    animal_docs = list(self.nlp.pipe(animal_names))
    matcher = PhraseMatcher(self.nlp.vocab)
    matcher.add('ANIMAL', None, *animal_docs)
    return matcher
print("Please wait whilst spaCy language library is loaded...") nlp = spacy.load('en_core_web_md') """ ////////////////////////////////////////////////////// Change global values for bad words here ////////////////////////////////////////////////////// """ BAD_STEM_WORDS_LIST = [ "you", "option", "accurate", "correct", "true", "can be", "only", "statement" ] BAD_OPTION_WORDS_LIST = ["only", "statement", "all of the above"] # Create spaCy PhraseMatchers (lowercase for case-insensitivity) dnd_matcher = PhraseMatcher(nlp.vocab, attr="LOWER") dnd_term = ["Drag and drop the"] dnd_patterns = [nlp.make_doc(text) for text in dnd_term] dnd_matcher.add("TerminologyList", None, *dnd_patterns) canbe_matcher = PhraseMatcher(nlp.vocab, attr="LOWER") canbe_term = ["can be"] canbe_patterns = [nlp.make_doc(text) for text in canbe_term] canbe_matcher.add("TerminologyList", None, *canbe_patterns) negative_matcher = Matcher(nlp.vocab) negative_matcher.add("NegativeList", None, [{ 'POS': 'VERB' }, { 'DEP': 'neg' }], [{
from spacy.matcher import Matcher from spacy.matcher import PhraseMatcher import datefinder from .skills_extract import workex_extract_skills #from dateparser.search import search_dates edu_stop_heading = "skills|declaration|personal|education|academ|activities|projects|objective|professional|summary|background|internship|technical|activities|exposure|achievement" base_path = os.path.dirname(__file__) nlp = spacy.load('en_core_web_sm') file = os.path.join(base_path, "titles_combined.txt") file = open(file, "r", encoding='utf-8') jobtitle = [line.strip().lower() for line in file] jobtitlematcher = PhraseMatcher(nlp.vocab) patterns = [ nlp.make_doc(text) for text in jobtitle if len(nlp.make_doc(text)) < 10 ] jobtitlematcher.add("Job title", None, *patterns) def extract_exp_section(terms, index_exp, heading_index): # temp_index_exp = index_exp # try: # index_exp = heading_index.index(index_exp) # except: # index_exp = 0 # line_text = "" # try: # if ((temp_index_exp+1) not in heading_index) or ((temp_index_exp+2) not in heading_index ) or ((temp_index_exp+3) not in heading_index ) :
# # show_ents(doc3) # from spacy.tokens import Span # doc = nlp(u'Tesla to build a BR factory for alot of money') # ORG = doc.vocab.strings[u'ORG'] # print(ORG) # # print(doc.ents) from spacy.tokens import Span doc = nlp(u'Our company created a brand new vacuum cleaner This new vacuum-cleaner is the best in show' u'This new vacuum-cleaner is the best in show') show_ents(doc) from spacy.matcher import PhraseMatcher encontrador = PhraseMatcher(nlp.vocab) lista_frase = ['vacuum cleaner', 'vacuum-cleaner'] padroes_frase = [nlp(text) for text in lista_frase] encontrador.add('novoproduto', None, *padroes_frase) found_matches = encontrador(doc) print(found_matches) from spacy.tokens import Span PROD = doc.vocab.strings[u'PRODUCT']#esse product é a tag da lista de tags que voce atribui para as palavras que quer adicionar print(found_matches) new_ents = [Span(doc, match[1], match[2], label=PROD) for match in found_matches]#aquei nos vamos atribuir a a match1 que eh o atributo 2 e o match2 que eh o terceiro atributo , sendo respectivamente onde começa e termina a palvra que queremos adicionar doc.ents = list(doc.ents) + new_ents show_ents(doc) doc_encontra = nlp(u'Originally I paid $29.95 for this card, but now this card is much more expencive. It is now 50 dollars') test = len([ent for ent in doc_encontra.ents if ent.label_ == 'MONEY']) print(test)
def __init__(self, nlp, terms, label):
    """Register *terms* (tokenized with make_doc) on a matcher under *label*."""
    self.matcher = PhraseMatcher(nlp.vocab)
    term_docs = [nlp.make_doc(term) for term in terms]
    self.matcher.add(label, None, *term_docs)
# load pre-trained model base_path = os.path.dirname(__file__) nlp = spacy.load('en_core_web_sm') custom_nlp2 = spacy.load(os.path.join(base_path,"degree","model")) custom_nlp3 = spacy.load(os.path.join(base_path,"company_working","model")) # initialize matcher with a vocab matcher = Matcher(nlp.vocab) file = os.path.join(base_path,"titles_combined.txt") file = open(file, "r", encoding='utf-8') designation = [line.strip().lower() for line in file] designitionmatcher = PhraseMatcher(nlp.vocab) patterns = [nlp.make_doc(text) for text in designation if len(nlp.make_doc(text)) < 10] designitionmatcher.add("Job title", None, *patterns) file = os.path.join(base_path,"LINKEDIN_SKILLS_ORIGINAL.txt") file = open(file, "r", encoding='utf-8') skill = [line.strip().lower() for line in file] skillsmatcher = PhraseMatcher(nlp.vocab) patterns = [nlp.make_doc(text) for text in skill if len(nlp.make_doc(text)) < 10] skillsmatcher.add("Job title", None, *patterns) class resumeparse(object): objective = ( 'career goal',
def informal_word_detection(self, sent_list):
    """Detect informal words in each sentence of *sent_list* and replace
    them in place with their formal equivalents, then hand the list on to
    the tense-conversion step.

    Single informal verbs are matched by lemma with a rule-based Matcher
    (so inflected forms are caught too); multi-word entries are matched
    literally with a PhraseMatcher. NOTE(review): mutates *sent_list* in
    place and returns nothing — confirm callers rely on the mutation.
    """
    # punctuation characters, used to decide whether a space is needed
    # after a replacement
    punctuation_list = string.punctuation
    # two matchers: rule-based (lemma) and phrase-based (literal)
    matcher_rule = Matcher(nlp.vocab)
    matcher_phrase = PhraseMatcher(nlp.vocab)
    # Penn Treebank tags for all verb forms
    verb_types = ["VB", "VBD", "VBG", "VBN", "VBP", "VBZ"]
    # informal words and their formal counterparts, aligned by index
    with open('Model/informal_word_list.txt', 'r') as file:
        informal_word_list = ["" + line.strip() + "" for line in file]
    with open('Model/formal_word_list.txt', 'r') as file:
        formal_word_list = ["" + line.strip() + "" for line in file]

    phrase_list = list()
    for i in range(len(informal_word_list)):
        try:
            word = informal_word_list[i]
            # single-token verbs get a lemma rule so any tense matches;
            # everything else is matched as a literal phrase
            if len(word.split()) == 1 and str(
                    nlp(word)[0].tag_) in verb_types:
                # optional trailing punctuation is absorbed into the match
                pattern = [{'LEMMA': word}, {'IS_PUNCT': True, 'OP': '?'}]
                # the rule name is the list index, so the formal word can
                # be recovered later by position
                matcher_rule.add(str(i), None, pattern)
            else:
                phrase_list.append(word)
        except Exception:
            continue

    # tokenize the phrases and register them for direct replacement
    phrase_patterns = [nlp(text) for text in phrase_list]
    matcher_phrase.add('Informal word matcher', None, *phrase_patterns)

    for i in range(len(sent_list)):
        # tokenize the sentence once and run both matchers over it
        sentense = nlp(sent_list[i])
        matches_1 = matcher_rule(sentense)
        matches_2 = matcher_phrase(sentense)
        # unite the two match lists, then sort by start offset so the
        # sentence can be rebuilt left to right without overlaps clashing
        matches = matches_1 + matches_2
        matches.sort(key=lambda x: x[1])
        if len(matches) != 0:
            try:
                new_sent = ""
                # end offset of the previous match; None until the first
                # replacement has been emitted
                previous_end = None
                for match in matches:
                    # the matched (informal) text and the tag of its first token
                    informal_word = str(sentense[match[1]:match[2]])
                    word_type = str(sentense[match[1]:match[2]][0].tag_)
                    # The informal list stores base forms only. If the
                    # matched surface form is NOT in the list but is a
                    # verb, look it up by lemma and re-inflect the formal
                    # replacement to the detected tense.
                    if not informal_word_list.__contains__(
                            informal_word) and word_type in verb_types:
                        index = informal_word_list.index(
                            sentense[match[1]:match[2]][0].lemma_)
                        # getInflection converts the formal base form to
                        # the same tag (tense) as the original word
                        formal_word = getInflection(
                            formal_word_list[index], tag=str(word_type))[0]
                    # otherwise it is a literal phrase match: direct lookup
                    else:
                        index = informal_word_list.index(informal_word)
                        formal_word = formal_word_list[index]
                    # first replacement: copy everything before the match
                    if previous_end is None:
                        new_sent = new_sent + str(
                            sentense[:match[1]]).strip(
                        ) + " " + formal_word
                        # if the next token is not punctuation, add a space
                        if len(sentense) != match[2] and str(sentense[
                                match[2]]) not in punctuation_list:
                            new_sent = new_sent + " "
                            previous_end = match[2]
                        else:
                            previous_end = match[2]
                    else:
                        # continuation: copy the gap since the last match
                        new_sent = new_sent + str(
                            sentense[previous_end:match[1]]).strip(
                        ) + " " + formal_word
                        # if the next token is not punctuation, add a space
                        if len(sentense) != match[2] and str(sentense[
                                match[2]]) not in punctuation_list:
                            new_sent = new_sent + " "
                            previous_end = match[2]
                        else:
                            previous_end = match[2]
                # append whatever follows the final match
                new_sent = new_sent + str(sentense[previous_end:]).strip()
                sent_list[i] = new_sent.strip()
            except Exception:
                # on any reconstruction failure, fall back to the
                # untouched sentence text
                sent_list[i] = str(sentense)
    # for sent in sent_list:
    #     print(sent)
    self.tense_conversion_obj.future_tense_det(sent_list)
def __init__(self, nlp, ontoDict):
    """Build a PhraseMatcher over all ontology class labels, their plurals
    and synonyms, and register the custom Token/Doc/Span extensions.

    Fixes relative to the previous version:
    - work on a *copy* of ``nlp.Defaults.stop_words`` instead of mutating
      the shared class-level set (mutating it changed stop-word behaviour
      for every pipeline built from the same language class);
    - the bare ``except:`` around the plural lookup now catches only
      ``Exception`` so KeyboardInterrupt/SystemExit are not swallowed.

    :param nlp: loaded spaCy pipeline (supplies vocab and tokenizer).
    :param ontoDict: dict with an "ontologies" list; each entry maps
        "label" to a label string and "ontology" to an ontology object
        exposing get_classes / get_id_for_iri / get_annotation(s).
    """
    # add ontology and label from ontoDict
    self.ontoDict = ontoDict
    self.all_labels = ""

    # stop words, don't try to match these.
    # Copy the default set so the shared nlp.Defaults is left untouched.
    stopwords = set(nlp.Defaults.stop_words)
    stopwords.add("ands")
    stopwords.add("ends")
    stopwords.add("ci")

    self.ontols = []

    ontologies = ontoDict["ontologies"]
    for ontology in ontologies:
        for key, value in ontology.items():
            if (key == "label"):
                # concatenated labels become the single matcher rule name
                self.all_labels = self.all_labels + value
            if (key == "ontology"):
                self.ontols.append(value)

    # for making plural forms of labels for text matching
    engine = inflect.engine()

    # init terms and patterns
    self.terms = {}
    patterns = []

    # build unified table of all ID, IRI, Label and Synonyms:
    for ontol in self.ontols:  # should be all ontols in
        print("checking ontol: ", ontol)
        for termid in ontol.get_classes():
            termshortid = ontol.get_id_for_iri(termid)

            label = ontol.get_annotation(termid, RDFSLABEL)
            definition = ontol.get_annotation(termid, DEFINITION)
            if label:
                # prefer the short CURIE-style id when one exists
                term_entry = {
                    'id': termid if termshortid is None else termshortid,
                    'name': label.strip(),
                    'definition': definition
                }
                # index the label and its plural, unless it is a stop word
                if label is not None and label.strip().lower(
                ) not in stopwords:
                    self.terms[label.strip().lower()] = term_entry
                    patterns.append(nlp.make_doc(label.strip().lower()))
                    plural = engine.plural(label.strip())
                    self.terms[plural.lower()] = term_entry
                    patterns.append(nlp.make_doc(plural.lower()))
                # synonyms (and their plurals) map to the same term entry
                synonyms = ontol.get_annotations(termid, SYN)
                for s in synonyms:
                    if s.strip().lower() not in stopwords:
                        self.terms[s.strip().lower()] = term_entry
                        patterns.append(nlp.make_doc(s.strip().lower()))
                        try:
                            plural = engine.plural(s.strip().lower())
                            self.terms[plural.lower()] = term_entry
                            patterns.append(nlp.make_doc(plural.lower()))
                        except Exception:
                            # inflect can choke on odd synonym strings;
                            # skip the plural but keep the singular
                            print("Problem getting plural of ", s)
                            continue

    # initialize matcher and add patterns; LOWER = case-insensitive match
    self.matcher = PhraseMatcher(nlp.vocab, attr='LOWER')
    self.matcher.add(self.all_labels, None, *patterns)

    # set extensions to tokens, spans and docs
    Token.set_extension("is_ontol_term", default=False, force=True)
    Token.set_extension("ontol_id", default=False, force=True)
    Token.set_extension("merged_concept", default=False, force=True)
    Doc.set_extension("has_ontols", getter=self.has_ontols, force=True)
    Doc.set_extension("ontols", default=[], force=True)
    Span.set_extension("has_ontols", getter=self.has_ontols, force=True)
def mainpipe(inputfile, search_term, max_records, json_out, embvec,
             embvecache, val_ratio, rnnsize, batchsize, lr, weight_decay,
             n_epochs, model_save, es):
    """End-to-end keyphrase pipeline: fetch PubMed abstracts, BIO-tag the
    author keywords in each title+abstract, train an RNN-CRF tagger,
    extract keyphrases for records that had none, and optionally index
    the result into Elasticsearch.

    :param inputfile: 1 to read all parameters from "input.txt" (a Python
        dict literal), overriding most arguments.
    :param search_term: PubMed query string.
    :param max_records: maximum PubMed records to fetch.
    :param json_out: NOTE(review): unused in this function body.
    :param embvec: 1 to load GloVe 840B/300d embeddings from *embvecache*.
    :param es: 1 to bulk-index the output into a local Elasticsearch.
    NOTE(review): `use_pretrained` is only assigned when embvec == 1, so
    any other value raises NameError at the build_vocab branch — confirm
    intended call pattern. Device is hard-coded to 'cuda' and the output
    JSON path is absolute; both look environment-specific.
    """
    if inputfile == 1:
        # parameter file is a Python dict literal; literal_eval keeps this
        # safe (no arbitrary code execution)
        with open("input.txt", "r") as f:
            para = ast.literal_eval(f.read())
        search_term = para['search_term']
        max_records = para['max_records']
        embvec = para['embvec']
        embvecache = para['embvecache']
        val_ratio = para['val_ratio']
        rnnsize = para['rnnsize']
        batchsize = para['batchsize']
        lr = para['lr']
        weight_decay = para['weight_decay']
        n_epochs = para['n_epochs']
        model_save = para['model_save']
    if embvec == 1:
        embvec = torchtext.vocab.GloVe(name='840B', dim=300,
                                       cache=embvecache)
        use_pretrained = True
    with mlflow.start_run() as mlrun:
        pubmed = PubMed(tool="AlphabetH", email="*****@*****.**")
        query = search_term
        results = pubmed.query(query, max_results=max_records)
        # pp: pubmed_id -> field dict; each field is copied best-effort,
        # since any attribute may be missing on a given article
        pp = defaultdict(lambda: defaultdict(dict))
        for art in results:
            pmed = art.pubmed_id
            try:
                pp[pmed]['title'] = art.title
            except (AttributeError, TypeError):
                pass
            try:
                pp[pmed]['abstract'] = art.abstract
            except (AttributeError, TypeError):
                pass
            # conclusions/methods/results are appended onto the abstract
            # text when present
            try:
                pp[pmed]['abstract'] = pp[pmed]['abstract'] + art.conclusions
            except (AttributeError, TypeError):
                pass
            try:
                pp[pmed]['abstract'] = pp[pmed]['abstract'] + art.methods
            except (AttributeError, TypeError):
                pass
            try:
                pp[pmed]['abstract'] = pp[pmed]['abstract'] + art.results
            except (AttributeError, TypeError):
                pass
            try:
                pp[pmed]['keywords'] = art.keywords
            except (AttributeError, TypeError):
                pass
            try:
                pp[pmed]['authors'] = art.authors
            except (AttributeError, TypeError):
                pass
            try:
                pp[pmed]['journal'] = art.journal
            except (AttributeError, TypeError):
                pass
            try:
                pp[pmed]['pubdate'] = str(art.publication_date.year)
            except (AttributeError, TypeError):
                pass
            try:
                pp[pmed]['conclusions'] = art.conclusions
            except (AttributeError, TypeError):
                pass
        # ensure the spaCy model is available before loading it
        print(subprocess.getoutput("python -m spacy download en_core_web_sm"))
        artpd = pd.DataFrame.from_dict(pp, orient='index')
        # keep only records with both an abstract and a title
        artpda = artpd[artpd.abstract.notnull()].copy()
        artpda = artpda[artpd.title.notnull()]
        # artpda.index = pd.Series(artpda.index).apply(lambda x: x[0:8])
        # records that DO have author keywords become the labelled set
        artpdak = artpda[artpda.keywords.str.len() > 0].copy()
        dataf = pd.DataFrame(
            index=artpdak.index,
            columns=['SRC', 'TRG', 'keywords', 'Extracted', 'abskey'])
        dataf.loc[:, 'SRC'] = artpdak.title + ' ' + artpdak.abstract
        dataf.loc[:, 'keywords'] = artpdak.keywords
        svoc = spacy.load("en_core_web_sm")
        matcher = PhraseMatcher(svoc.vocab, attr="LOWER")
        # Build a BIO tag sequence per document: B/I over every keyword
        # occurrence, O elsewhere. The matcher is re-seeded per document.
        for pmid in dataf.index:
            t0 = dataf.loc[pmid]
            patterns = [svoc.make_doc(str(name)) for name in t0.keywords]
            matcher.add("Names", None, *patterns)
            doc = svoc(t0.SRC)
            t1 = ['O'] * (len(doc))
            matched = []
            matn = 0
            for _, start, end in matcher(doc):
                t1[start] = 'B'
                t1[start + 1:end] = 'I' * (end - start - 1)
                # count each distinct keyword only once
                if str(doc[start:end]).lower() not in matched:
                    matn = matn + 1
                    matched.append(str(doc[start:end]).lower())
            # keywords never found in the text ("abstract-only" keywords)
            abskw = []
            for x in t0.keywords:
                if x.lower() not in matched:
                    abskw.append(x)
            dataf.loc[pmid, 'TRG'] = ' '.join([t for t in t1])
            dataf.loc[pmid, 'Extracted'] = matn
            dataf.loc[pmid, 'abskey'] = abskw
            matcher.remove("Names")
        # train on documents where at least 3 keywords were found in-text
        datatrain = dataf[dataf['Extracted'] >= 3].copy()
        datatest = dataf[dataf['Extracted'] < 3].copy()
        # separate train and validate
        dtrain = datatrain.loc[:, ['SRC', 'TRG']]
        dtraink = datatrain.loc[:, ['SRC', 'TRG', 'keywords']]
        # fixed seed for a reproducible shuffle/split
        seed = 250
        idx = np.arange(datatrain.shape[0])
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        np.random.shuffle(idx)
        val_size = int(len(idx) * val_ratio)
        df_train = dtrain.iloc[idx[val_size:], :]
        df_val = dtrain.iloc[idx[:val_size], :]
        df_val_k = dtraink.iloc[idx[:val_size], :]
        df_test = datatest.loc[:, ['SRC', 'TRG']]
        # NOTE(review): the next two assignments duplicate the ones above
        dtraink = datatrain.loc[:, ['SRC', 'TRG', 'keywords']]
        df_val_k = dtraink.iloc[idx[:val_size], :]
        # Load original dataset
        datai = artpda.copy()
        datai = datai[datai.abstract.notnull()]
        datai = datai[datai.title.notnull()]
        datai = datai.replace('\n', ' ', regex=True)
        datai = datai.replace('\t', ' ', regex=True)
        # split into records without / with author keywords
        dataiu = datai.loc[datai.keywords.str.len() == 0]
        dataik = datai.loc[datai.keywords.str.len() > 0]
        dataiu['SRC'] = dataiu.title + ' ' + dataiu.abstract

        tokenizertrg = lambda x: x.split()

        def tokenizersrc(text):  # create a tokenizer function
            return [tok.text for tok in svoc.tokenizer(text)]

        def safe_value(field_val):
            # Elasticsearch rejects NaN; substitute a placeholder string
            return field_val if not pd.isna(field_val) else "Other"

        def safe_year(field_val):
            # NaN publication year -> sentinel 1900
            return field_val if not pd.isna(field_val) else 1900

        TEXT = torchtext.data.Field(init_token='<bos>',
                                    eos_token='<eos>',
                                    sequential=True,
                                    lower=False)
        LABEL = torchtext.data.Field(init_token='<bos>',
                                     eos_token='<eos>',
                                     sequential=True,
                                     unk_token=None)
        fields = [('text', TEXT), ('label', LABEL)]
        device = 'cuda'
        train_examples = read_data(df_train, fields, tokenizersrc,
                                   tokenizertrg)
        valid_examples = read_data(df_val, fields, tokenizersrc,
                                   tokenizertrg)
        # Load the pre-trained embeddings that come with the torchtext
        # library. NOTE(review): use_pretrained is undefined unless
        # embvec == 1 was passed — confirm intended call pattern.
        if use_pretrained:
            print('We are using pre-trained word embeddings.')
            TEXT.build_vocab(train_examples, vectors=embvec)
        else:
            print('We are training word embeddings from scratch.')
            TEXT.build_vocab(train_examples, max_size=5000)
        LABEL.build_vocab(train_examples)
        # Create one of the models defined above.
        # self.model = RNNTagger(self.TEXT, self.LABEL, emb_dim=300, rnn_size=128, update_pretrained=False)
        model0 = RNNCRFTagger(TEXT,
                              LABEL,
                              rnnsize,
                              emb_dim=300,
                              update_pretrained=False)
        model0.to(device)
        optimizer = torch.optim.Adam(model0.parameters(),
                                     lr=lr,
                                     weight_decay=weight_decay)
        train(train_examples, valid_examples, embvec, TEXT, LABEL, device,
              model0, batchsize, optimizer, n_epochs)
        out2 = evaltest2(df_val, df_val_k, model0, tokenizersrc, fields,
                         device)
        # fraction of gold keyphrases recovered on the validation set
        ttp3 = kphperct(df_val_k, out2, svoc)
        mlflow.log_param("epochs", n_epochs)
        mlflow.pytorch.save_model(model0, model_save)
        mlflow.log_metric("extraction_rate", ttp3.mean())
        # NOTE(review): this evaltest2 call passes one argument fewer than
        # the call above — verify evaltest2's signature supports both.
        augout = evaltest2(dataiu, model0, tokenizersrc, fields, device)
        klist = kphext2(dataiu.SRC, augout, svoc)
        # append the newly-extracted keyphrases to the keyword-less rows
        for i in range(len(dataiu.index)):
            dataiu.iloc[i, 2].extend(list(set(klist[i])))
        output = pd.concat([dataik, dataiu], join="inner")
        output.to_json('/home/pding/OneDrive/kph/MSaug.json', orient='index')
        if es == 1:
            output['journal'] = output['journal'].apply(safe_value)
            output['conclusions'] = output['conclusions'].apply(safe_value)
            output['pubdate'] = output['pubdate'].apply(safe_year)
            output['PMID'] = output.index
            test_server = [{'host': '127.0.0.1', 'port': 9200}]
            es = Elasticsearch(test_server, http_compress=True)
            use_these_keys = [
                'PMID', 'title', 'abstract', 'keywords', 'authors', 'pubdate'
            ]

            def filterKeys(document):
                # project each row down to the indexable fields
                return {key: document[key] for key in use_these_keys}

            def doc_generator(df):
                df_iter = df.iterrows()
                for index, document in df_iter:
                    try:
                        yield {
                            "_index": 'ms',
                            "_source": filterKeys(document),
                        }
                    except StopIteration:
                        return

            helpers.bulk(es, doc_generator(output))
        print(ttp3.mean())
def test_issue4373():
    """Both matcher classes must expose their Vocab via `.vocab`."""
    for matcher_cls in (Matcher, PhraseMatcher):
        instance = matcher_cls(Vocab())
        assert isinstance(instance.vocab, Vocab)
def test_matcher_phrase_matcher(en_vocab):
    """A two-token phrase pattern matches exactly once inside a sentence."""
    matcher = PhraseMatcher(en_vocab)
    pattern_doc = Doc(en_vocab, words=["Google", "Now"])
    matcher.add("COMPANY", None, pattern_doc)
    target = Doc(en_vocab, words=["I", "like", "Google", "Now", "best"])
    matches = matcher(target)
    assert len(matches) == 1
def tech_matcher_factory(nlp):
    """Build a case-insensitive PhraseMatcher over the technology terms."""
    term_docs = [nlp.make_doc(term) for term in _technology_terms]
    phrase_matcher = PhraseMatcher(nlp.vocab, attr='LOWER')
    phrase_matcher.add("Phrase Matching", None, *term_docs)
    return phrase_matcher
def create_profile(file):
    """Extract skill keywords from a PDF resume and return a DataFrame
    with columns Candidate Name / Subject / Keyword / Count.

    The candidate name is taken from the part of the filename before the
    first underscore. NOTE(review): the keyword CSV path is a hard-coded
    Windows path — confirm it exists in the deployment environment.
    """
    text = pdfextract(file)
    text = str(text)
    # pdfextract appears to return text with literal "\n" sequences;
    # strip those, then lowercase for case-insensitive matching
    text = text.replace("\\n", "")
    text = text.lower()
    # below is the csv where we have all the keywords, you can customize your own
    keyword_dict = pd.read_csv('D:/NLP_Resume/resume/template_new.csv')
    # one pattern list per skill category column in the CSV
    stats_words = [
        nlp(text) for text in keyword_dict['Statistics'].dropna(axis=0)
    ]
    NLP_words = [nlp(text) for text in keyword_dict['NLP'].dropna(axis=0)]
    ML_words = [
        nlp(text) for text in keyword_dict['Machine Learning'].dropna(axis=0)
    ]
    DL_words = [
        nlp(text) for text in keyword_dict['Deep Learning'].dropna(axis=0)
    ]
    R_words = [nlp(text) for text in keyword_dict['R Language'].dropna(axis=0)]
    python_words = [
        nlp(text) for text in keyword_dict['Python Language'].dropna(axis=0)
    ]
    Data_Engineering_words = [
        nlp(text) for text in keyword_dict['Data Engineering'].dropna(axis=0)
    ]
    # one matcher rule per category; the rule id becomes the "Subject"
    matcher = PhraseMatcher(nlp.vocab)
    matcher.add('Stats', None, *stats_words)
    matcher.add('NLP', None, *NLP_words)
    matcher.add('ML', None, *ML_words)
    matcher.add('DL', None, *DL_words)
    matcher.add('R', None, *R_words)
    matcher.add('Python', None, *python_words)
    matcher.add('DE', None, *Data_Engineering_words)
    doc = nlp(text)

    # collect (category, matched text) pairs for every hit
    d = []
    matches = matcher(doc)
    for match_id, start, end in matches:
        rule_id = nlp.vocab.strings[
            match_id]  # get the unicode ID, i.e. 'COLOR'
        span = doc[start:end]  # get the matched slice of the doc
        d.append((rule_id, span.text))
    # Counter(d) counts each (category, keyword) pair; format as
    # "<category> <keyword> (<count>)" lines
    keywords = "\n".join(f'{i[0]} {i[1]} ({j})' for i, j in Counter(d).items())

    ## converting string of keywords to dataframe
    # NOTE(review): this round-trips the counts through CSV text and then
    # re-splits on ' ' and '(' — keywords containing those characters
    # would parse incorrectly.
    df = pd.read_csv(StringIO(keywords), names=['Keywords_List'])
    df1 = pd.DataFrame(df.Keywords_List.str.split(' ', 1).tolist(),
                       columns=['Subject', 'Keyword'])
    df2 = pd.DataFrame(df1.Keyword.str.split('(', 1).tolist(),
                       columns=['Keyword', 'Count'])
    df3 = pd.concat([df1['Subject'], df2['Keyword'], df2['Count']], axis=1)
    df3['Count'] = df3['Count'].apply(lambda x: x.rstrip(")"))

    # candidate name = filename text before the first underscore
    base = os.path.basename(file)
    filename = os.path.splitext(base)[0]
    name = filename.split('_')
    name2 = name[0]
    name2 = name2.lower()
    ## converting str to dataframe
    name3 = pd.read_csv(StringIO(name2), names=['Candidate Name'])

    dataf = pd.concat([
        name3['Candidate Name'], df3['Subject'], df3['Keyword'], df3['Count']
    ],
                      axis=1)
    # the name only exists on the first row; forward-fill it to the rest
    dataf['Candidate Name'].fillna(dataf['Candidate Name'].iloc[0],
                                   inplace=True)

    return (dataf)
def create_profile(file):
    """Extract QA/automation skill keywords from a DOCX resume and return
    a DataFrame with columns Candidate Name / Subject / Keyword / Count.

    Same shape as the PDF-based variant elsewhere in this file, but reads
    DOCX content and uses the automation-profile keyword dictionary.
    NOTE(review): the keyword CSV path is a hard-coded Windows path —
    confirm it exists in the deployment environment.
    """
    text = getDocxContent(file)
    text = str(text)
    # strip literal "\n" sequences, then lowercase for matching
    text = text.replace("\\n", "")
    text = text.lower()
    # below is the csv where we have all the keywords, you can customize your own
    keyword_dict = pd.read_csv(
        'D:/eclipse-workspace/ResumeParserUtilty/DataDictionary/AutomationProfileSearch.csv'
    )
    # one pattern list per skill category column in the CSV
    AutomationTool = [
        nlp(text) for text in keyword_dict['Automation tools'].dropna(axis=0)
    ]
    java_words = [
        nlp(text) for text in keyword_dict['Java Language'].dropna(axis=0)
    ]
    bigdata_words = [
        nlp(text) for text in keyword_dict['Big Data'].dropna(axis=0)
    ]
    JS_words = [
        nlp(text) for text in keyword_dict['JS Lanaguage'].dropna(axis=0)
    ]
    python_words = [
        nlp(text) for text in keyword_dict['Python Language'].dropna(axis=0)
    ]
    Data_Engineering_words = [
        nlp(text) for text in keyword_dict['Data Engineering'].dropna(axis=0)
    ]
    Bug_words = [
        nlp(text) for text in keyword_dict['Bug Tracking Tools'].dropna(axis=0)
    ]
    test_words = [
        nlp(text)
        for text in keyword_dict['Test Management Tool'].dropna(axis=0)
    ]
    Database_words = [
        nlp(text) for text in keyword_dict['DataBase'].dropna(axis=0)
    ]
    # one matcher rule per category; the rule id becomes the "Subject"
    matcher = PhraseMatcher(nlp.vocab)
    matcher.add('AutoTool', None, *AutomationTool)
    matcher.add('JAVA', None, *java_words)
    matcher.add('BigData', None, *bigdata_words)
    matcher.add('JS', None, *JS_words)
    matcher.add('Python', None, *python_words)
    matcher.add('DE', None, *Data_Engineering_words)
    matcher.add('JIRA', None, *Bug_words)
    matcher.add('TM', None, *test_words)
    matcher.add('DB', None, *Database_words)
    doc = nlp(text)

    # collect (category, matched text) pairs for every hit
    d = []
    matches = matcher(doc)
    for match_id, start, end in matches:
        rule_id = nlp.vocab.strings[
            match_id]  # get the unicode ID, i.e. 'COLOR'
        span = doc[start:end]  # get the matched slice of the doc
        d.append((rule_id, span.text))
    # Counter(d) counts each (category, keyword) pair; format as
    # "<category> <keyword> (<count>)" lines
    keywords = "\n".join(f'{i[0]} {i[1]} ({j})' for i, j in Counter(d).items())

    ## converting string of keywords to dataframe
    # NOTE(review): re-parsing the formatted lines with split(' ') and
    # split('(') breaks for keywords containing those characters.
    df = pd.read_csv(StringIO(keywords), names=['Keywords_List'])
    df1 = pd.DataFrame(df.Keywords_List.str.split(' ', 1).tolist(),
                       columns=['Subject', 'Keyword'])
    df2 = pd.DataFrame(df1.Keyword.str.split('(', 1).tolist(),
                       columns=['Keyword', 'Count'])
    df3 = pd.concat([df1['Subject'], df2['Keyword'], df2['Count']], axis=1)
    df3['Count'] = df3['Count'].apply(lambda x: x.rstrip(")"))

    # candidate name = filename text before the first underscore
    base = os.path.basename(file)
    filename = os.path.splitext(base)[0]
    name = filename.split('_')
    name2 = name[0]
    name2 = name2.lower()
    ## converting str to dataframe
    name3 = pd.read_csv(StringIO(name2), names=['Candidate Name'])

    dataf = pd.concat([
        name3['Candidate Name'], df3['Subject'], df3['Keyword'], df3['Count']
    ],
                      axis=1)
    # the name only exists on the first row; forward-fill it to the rest
    dataf['Candidate Name'].fillna(dataf['Candidate Name'].iloc[0],
                                   inplace=True)

    return (dataf)
def patternSearch(T_0, T, file, scoring_mode):
    """Find context/POS patterns around occurrences of the seed phrases
    *T* in *file*, rank them with PrDualRank, and return the patterns
    sorted by the chosen score (best first).

    :param T_0: seed set passed through to run_prdualrank.
    :param T: list of seed phrase strings to locate in the corpus.
    :param file: path to the corpus file (also re-read by getPhrases).
    :param scoring_mode: 0=F1, 1=precision, 2=recall, 3=P*R, 4=P+R;
        anything else scores every pattern -100.
    """
    current_patterns = [nlp(x) for x in T]
    phrase_matcher = PhraseMatcher(nlp.vocab)
    phrase_matcher.add('pattern search', None, *current_patterns)
    unranked_patterns = []
    # find occurrences of seed phrases
    with open(file, "r") as f:
        file_chunk = partition(f)
        for document in file_chunk:
            print(len(document))
            document = nlp(document)
            # deduplicated (start, end) offsets of seed-phrase matches
            phrase_patterns = set()
            matches = phrase_matcher(document)
            for match_id, start, end in matches:
                p = tuple((start, end))
                if p not in phrase_patterns:
                    phrase_patterns.add(p)
            # find patterns around seed phrases
            for phrase_pattern in phrase_patterns:
                start = phrase_pattern[0]
                end = phrase_pattern[1]
                # skip matches at the start of a line
                if (document[start - 1].text == '\n'):
                    continue
                # add context pattern: up to 2 literal tokens before the
                # match, but only the single preceding token when it is a
                # preposition (tag IN)
                tmp = []
                for i in range(2, 0, -1):
                    if document[start - 1].tag_ == "IN":
                        tmp.append({"TEXT": document[start - 1].text})
                        break
                    tmp.append({"TEXT": document[start - i].text})
                # add content pattern: the POS sequence of the match itself
                span = document[start:end]
                for token in span:
                    tmp.append({"POS": token.pos_})
                if tmp not in unranked_patterns:
                    unranked_patterns.append(tmp)
    unranked_phrases = list(getPhrases(file, unranked_patterns))
    # PrDualRank: m1/m2 are pattern precision/recall; l1-l4 are ranked
    # index lists (kept below for inspection but unused in the result)
    l1, l2, l3, l4, m1, m2, m3, m4 = run_prdualrank(T_0, unranked_patterns,
                                                    unranked_phrases, file)
    expanded_pattern_pre = [unranked_patterns[i] for i in l1]
    expanded_pattern_rec = [unranked_patterns[i] for i in l2]
    expanded_eid_pre = [unranked_phrases[i] for i in l3]
    expanded_eid_rec = [unranked_phrases[i] for i in l4]
    pattern2fscore = {}
    for i in range(len(unranked_patterns)):
        recall = m2[i]
        precision = m1[i]
        fscore = 0
        if scoring_mode == 0:
            # harmonic mean, guarding against division by zero
            if (recall + precision) == 0:
                fscore = 0
            else:
                fscore = ((2 * recall * precision) / (recall + precision))
        elif scoring_mode == 1:
            fscore = precision
        elif scoring_mode == 2:
            fscore = recall
        elif scoring_mode == 3:
            fscore = precision * recall
        elif scoring_mode == 4:
            fscore = precision + recall
        else:
            # unknown mode: push the pattern to the bottom of the ranking
            fscore = -100
        pattern2fscore[i] = fscore
    # sort pattern indices by score, highest first
    sorted_patterns_ids = sorted(pattern2fscore,
                                 key=pattern2fscore.__getitem__,
                                 reverse=True)
    sorted_patterns = [unranked_patterns[i] for i in sorted_patterns_ids]
    return sorted_patterns
def getsentencetense(doc, tense_uses):
    """Return True when *doc* contains any of the phrases in *tense_uses*."""
    tense_matcher = PhraseMatcher(nlp.vocab)
    phrase_docs = [nlp(phrase) for phrase in tense_uses]
    tense_matcher.add('tense', None, *phrase_docs)
    hits = tense_matcher(doc)
    return len(hits) != 0
# Render the classification result, then rebuild the doc's entities from
# the NER service response and highlight them via a PhraseMatcher.
if res_cat is not None:
    st.header(f"CLASSIFICATION")
    res_cat = res_cat.get('result')[0]
    st.subheader(f"> {res_cat.get('category').capitalize()} ({res[0].get('result')[0].get('score')})")

# Extracted Entities
if res_ner is not None and len(res_ner.get('result')) > 0:
    st.header("NAMED ENTITIES")
    df_ner = pd.read_json(json.dumps(res_ner.get('result')))

    # Get value pairs as dict: label -> list of entity surface strings
    entity_names = {x:[] for x in df_ner.label}
    for x, y in zip(df_ner.label,df_ner.value):
        entity_names[x].append(y)
    entities = [*entity_names]

    # Create matcher: one rule per entity label, one pattern per value
    matcher = PhraseMatcher(nlp.vocab)
    for key, value in entity_names.items():
        patterns = [nlp(entity) for entity in value]
        matcher.add(key, None, *patterns)

    # keep only existing entities whose label came back from the service
    doc.ents = [ent for ent in list(doc.ents) if ent.label_ in entity_names]
    matches = matcher(doc)

    # Get matches in text
    # NOTE(review): rule_id/span are recomputed each iteration but not
    # used inside this loop — presumably consumed further down (view is
    # truncated here); confirm before removing.
    for match_id, start, end in matches:
        rule_id = nlp.vocab.strings[match_id]
        span = doc[start : end]

    # Transform to spans and check for duplicates
    starts = []
    for match_id, start, end in matches:
        rule_id = nlp.vocab.strings[match_id]
def test_phrase_matcher_basic_check(en_vocab):
    """Adding a bare Doc (instead of a list of patterns) must raise."""
    single_doc = Doc(en_vocab, words=["hello", "world"])
    matcher = PhraseMatcher(en_vocab)
    # common user mistake: forgetting to wrap the pattern in a list
    with pytest.raises(ValueError):
        matcher.add("TEST", single_doc)