def main():
    """Evaluate NameDataset first-name lookup against known-name / non-name word lists.

    Prints precision, recall, F1 and accuracy, treating "is a name" as the
    positive class (label convention: 0 => not a name, 1 => name).
    """
    name_db = NameDataset()
    names = read_dict_file('eng_dictionary/names-from-forbes-wp_users.txt')
    not_names = read_dict_file('eng_dictionary/google-10000-english-no-names.txt')
    not_names.extend(read_dict_file('eng_dictionary/1000-no-names.txt'))
    names = sorted(set(names))
    not_names = sorted(set(not_names))

    targets = []
    predictions = []
    # Positive examples first, then negatives, mirroring the label convention.
    for word in names:
        predictions.append(name_db.search_first_name(word))
        targets.append(True)
    for word in not_names:
        predictions.append(name_db.search_first_name(word))
        targets.append(False)

    print('P', precision_score(y_true=targets, y_pred=predictions))
    print('R', recall_score(y_true=targets, y_pred=predictions))
    print('F', f1_score(y_true=targets, y_pred=predictions))
    print('A', accuracy_score(y_true=targets, y_pred=predictions))
def namecheck(inp):
    """Return True when *inp* is a known first name or last name, else False.

    The original nested if/elif chain contained unreachable branches and an
    unused counter; its net behavior was simply "found as a first name OR
    found as a last name", which is what this computes directly.
    """
    m = NameDataset()
    inp = str(inp)
    # Preserve the original semantics exactly: anything other than the
    # literal False from a lookup counted as "found".
    first_found = m.search_first_name(inp) != False
    last_found = m.search_last_name(inp) != False
    return first_found or last_found
def main():
    """Highlight first/last names in the input using ANSI background colors.

    sys.argv[1] is either a path to a word-list file or a literal word.
    First names get a blue background, last names cyan.
    """
    m = NameDataset()
    if os.path.isfile(sys.argv[1]):
        words = read_dict_file(sys.argv[1])
    else:
        words = [sys.argv[1]]
    # Cheap word tokenizer.
    words = ' '.join(words).replace('.', ' ').replace('?', ' ').replace('\'', ' ').split(' ')
    output = ''
    for word in words:
        if m.search_first_name(word, use_upper_case=True):
            # BUG FIX: '\e' is not a Python escape sequence (it printed a
            # literal backslash-e); the ANSI ESC character is '\033'.
            output += '\033[44m' + word + '\033[0m'
        elif m.search_last_name(word, use_upper_case=True):
            output += '\033[46m' + word + '\033[0m'
        else:
            output += word
        output += ' '
    print(output)
def remove_name(input_text_or_list: Union[str, List[str]]) -> List[str]:
    """Remove person names from the input text or token list.

    Args:
        input_text_or_list: raw text (tokenized here via word_tokenize) or a
            pre-tokenized list that may contain None / empty entries.

    Returns:
        Tokens that are neither a known first name nor a known last name.
    """
    name_searcher = NameDataset()

    def _is_not_name(token: str) -> bool:
        # A token survives when neither name lookup recognizes it.
        return (not name_searcher.search_first_name(token)
                and not name_searcher.search_last_name(token))

    if isinstance(input_text_or_list, str):
        tokens = word_tokenize(input_text_or_list)
        return [token for token in tokens if _is_not_name(token)]
    # BUG FIX: validate the token BEFORE the name lookups; the original
    # called search_first_name(token) first, so a None entry reached the
    # lookup before the `token is not None` guard.
    return [
        token for token in input_text_or_list
        if token is not None and len(token) > 0 and _is_not_name(token)
    ]
def calculate_query_TFIDF(query_string, inverted_index, num_files, profile):
    """Build a TF-IDF weight vector for a social-media-profile query.

    Args:
        query_string: raw profile text; tokenized, de-punctuated, de-stopworded,
            name-filtered, stemmed and lowercased before weighting.
        inverted_index: ordered mapping of term -> postings; each postings
            object exposes a `.length` attribute used as document frequency.
            NOTE(review): assumed to be an OrderedDict so key order is stable
            between `l.index(...)` and the weights vector — confirm at caller.
        num_files: total number of documents (IDF denominator source).
        profile: subdirectory name under data/ where result pickles are written.

    Returns:
        Tuple (query_weights, query_length, query_appearances):
        weights list aligned with inverted_index key order, the Euclidean norm
        of that vector, and a Counter of cleaned query tokens.

    Side effects:
        Writes query_appearances.pickle and query_weights.pickle under
        data/<profile>/.
    """
    # List of words to remove words from profile text that appear often but have no bearing on user's likes/dislikes
    words_to_remove = ["birthday", "bday", "facebook", "lol", "thank", "christmas", "hanukkah", "happy"]
    # First we must preprocess the query (social media profile)
    m = NameDataset()
    tokens = nltk.word_tokenize(query_string)  # Tokenizes the string using NLTK
    tokens = [x for x in tokens if x not in string.punctuation]  # Don't include punctuation
    query_tokens = remove_stopwords(tokens)  # Remove the stopwords
    # Only includes words that are: 1.) In English 2.) Not in words_to_remove 3.) Not a first name or last name
    query_tokens = [x for x in query_tokens if (wordnet.synsets(x) and x not in words_to_remove and not m.search_first_name(x)) and not m.search_last_name(x)]
    query_tokens = stem_words(query_tokens)  # Stem words for preprocessing
    for i in range(0, len(query_tokens)):  # Converts all tokens to lowercase
        query_tokens[i] = query_tokens[i].lower()
    query_tokens = [x for x in query_tokens if x != 'birthdai']  # Makes sure this common word doesn't appear ('birthdai' is the Porter stem of 'birthday')
    query_appearances = collections.Counter()
    query_weights = [0] * len(inverted_index)  # Initialize vector to hold query weights
    query_length = 0.0
    l = list(inverted_index.keys())  # Gets list of tuples (query_term, index)
    for query_token in query_tokens:  # Counter that keeps track of word appearances
        query_appearances[query_token] += 1
    # Iterate through each term in the query vector and assign nonzero weight if the term appears in inverted index
    for query_term in query_appearances:
        if query_term in inverted_index:
            index_of_word = l.index(query_term)  # Since ordered dict, calculate index of term
            num_postings = inverted_index[query_term].length + 0.0  # Document frequency (+ 0.0 forces float division below)
            idf = math.log10(num_files / num_postings)  # Inverse document frequency
            tf = query_appearances[query_term]  # Term frequency
            query_weights[index_of_word] = tf * idf  # Query weight
            query_length += (tf * idf) * (tf * idf)  # Update running total for query length
    query_length = math.sqrt(query_length)  # Calculate final query length
    # Writes the query data to pickles
    pickle_out = open("data/"+profile+"/query_appearances.pickle", "wb")
    pickle.dump(query_appearances, pickle_out)
    pickle_out.close()
    pickle_out2 = open("data/" + profile + "/query_weights.pickle", "wb")
    pickle.dump(query_weights, pickle_out2)
    pickle_out2.close()
    return (query_weights, query_length, query_appearances)  # Returns the tuple of necessary data
def main():
    """Highlight first names in the input with an ANSI blue background."""
    m = NameDataset()
    if os.path.isfile(sys.argv[1]):
        # NOTE(review): argv[1] is checked for existence, but a fixed example
        # file is read instead — presumably intentional for this demo; verify.
        words = read_dict_file('generation/example_text.txt')
    else:
        words = [sys.argv[1]]
    # Cheap word tokenizer.
    words = ' '.join(words).replace('.', ' ').replace('?', ' ').split(' ')
    output = ''
    for word in words:
        if m.search_first_name(word):
            # BUG FIX: '\e' is not a Python escape sequence (it printed a
            # literal backslash-e); the ANSI ESC character is '\033'.
            output += '\033[44m' + word + '\033[0m'
        else:
            output += word
        output += ' '
    print(output)
def main():
    """Mark words whose first-name score exceeds a threshold as **BOLD UPPERCASE**."""
    name_db = NameDataset()
    source = sys.argv[1]
    words = read_dict_file(source) if os.path.isfile(source) else [source]
    # Cheap word tokenizer.
    words = ' '.join(words).replace('.', ' ').replace('?', ' ').replace('\'', ' ').split(' ')
    threshold = 5
    pieces = []
    for word in words:
        # search_first_name returns a numeric score here; only strong matches
        # (above the threshold) get emphasized.
        if name_db.search_first_name(word, use_upper_case=False) > threshold:
            pieces.append('**' + word.upper() + '**')
        else:
            pieces.append(word)
    # Original emitted a trailing space after the last word; keep it.
    print(' '.join(pieces) + ' ')
def fetch_training_data(keywords):
    """Scrape web text per keyword and drop frequent words and person names.

    Returns one lowercase token list per keyword, in input order.
    """
    frequent = TextClassifier.read_frequent_words()
    nd = NameDataset()
    #frequent.extend(TextClassifier.read_frequent_names())
    training = []
    # keywords holds one search keyword per category.
    for keyword in keywords:
        scraped = []
        for url in TextClassifier.fetch_urls(keyword, 10):
            scraped.extend(TextClassifier.scrape_text(url))
        kept = []
        for raw in scraped:
            lowered = raw.lower()
            # Skip overly common words and anything the name database knows.
            if lowered in frequent:
                continue
            if nd.search_first_name(raw) or nd.search_last_name(raw):
                continue
            kept.append(lowered)
        print(kept[0:100])
        print(len(kept))
        training.append(kept)
    return training
def wh_finder(self, entity_lst):
    """Pick the highest-scoring content word in self.answer and choose a WH-question word for it.

    Scans the POS-tagged sentence in self.answer (list of (token, tag, ...)
    tuples), selects the non-stopword noun/number with the highest
    self.word_scores value that is not already in entity_lst, then classifies
    it (location / language / name / thing / time / date / % / day-of-month /
    placement / quantity) to set self.wh_word, self.type, self.entity and
    self.position.

    Args:
        entity_lst: entities already used; candidates in this list are only
            kept as a fallback so a sentence is not dropped entirely.

    Side effects:
        Mutates self.answer (re-tags language words as NN) and sets
        self.entity, self.position, self.wh_word, self.type.
    """
    # Stemmer
    porter = PorterStemmer()
    # All our question words
    wh_words = [
        'who', 'when', 'where', 'what', 'how many', 'what language',
        'what percentage', 'which', 'what time'
    ]
    # Language model to detect languages
    # NOTE(review): pickle file handles are never closed here.
    language_model = open(
        os.path.join(settings.BASE_DIR,
                     'question_extractor/extract/models/language_model'), 'rb')
    language_model = pickle.load(language_model)
    # Location model to detect locations
    location_model = open(
        os.path.join(settings.BASE_DIR,
                     'question_extractor/extract/models/location_model'), 'rb')
    location_model = pickle.load(location_model)
    # Names dataset
    m = NameDataset()  # Names Dataset
    i = 0
    word_scores = self.word_scores
    maxxy = -10  # running best score; -10 acts as "nothing found yet"
    while i < len(self.answer):
        # Stem and lower words
        word = porter.stem(self.answer[i][0].lower())
        if word in language_model:
            # Checking for languages, usually tagged as JJ. Tag as NN.
            self.answer[i] = (self.answer[i][0], 'NN', self.answer[i][2])
        # Check for non-stop word and Nouns and digits
        if self.answer[i][0].lower() not in stop_words and (
                self.answer[i][1] == 'NNP' or self.answer[i][1] == 'NNPS'
                or self.answer[i][1] == 'NN' or self.answer[i][1] == 'NNS'
                or self.answer[i][1] == 'CD') and re.match(r'[\w]', word):
            # Maxxy is the highest score word, so get the most important
            if word_scores[word] > maxxy:
                if self.answer[i][0] not in entity_lst:
                    maxxy = word_scores[word]
                    j = i
                # Make sure sentence is not excluded due to duplicate entity
                else:
                    replace = i
        i += 1
    # If no fresh entity was ever selected, j is unbound; fall back to the
    # last duplicate candidate recorded in `replace`.
    try:
        j = j
    except:
        j = replace
    word = self.answer[j]
    self.entity = word[0]
    self.position = j
    # If word is noun phrase, plural or noun
    if word[1] == 'NNP' or word[1] == 'NNPS' or word[1] == 'NN' or word[
            1] == 'NNS':
        # Four options; Location(Where), Language(What language), Name(who), other(what)
        # Location
        if word[0].lower() in location_model:
            self.wh_word = wh_words[2]
            self.type = 'location'
        # Language of some kind
        elif word[0].lower() in language_model:
            self.wh_word = wh_words[5]
            self.type = 'language'
        # Name
        elif m.search_first_name(word[0].lower()) or m.search_last_name(
                word[0].lower()):
            self.wh_word = wh_words[0]
            self.type = 'name'
        # What if non of above
        else:
            self.wh_word = wh_words[3]
            self.type = 'thing'
    # if word is digits.
    elif word[1] == 'CD':
        # Time
        if re.match(r'([12][\d][:.][0-6][\d])+(am|pm)?', word[0]):
            # Time
            self.wh_word = wh_words[8]  # What time
            self.type = 'time'
        # Date
        elif re.match(
                r'[1,2,3][\d][/.][01][\d]?[\d]?[/.][10][\d][\d]?[\d]?',
                word[0]):
            #Date
            self.wh_word = wh_words[1]  # When
            self.type = 'date'
        # %
        elif '%' in word[0]:
            self.wh_word = wh_words[6]  # What %
            self.type = '%'
        # Day of month or placement
        elif re.match(r'[\d][\d]?(st|nd|rd|th)', word[0]):
            # Date with month
            months = [
                "january", "february", "march", "april", "may", "june",
                "july", "august", "september", "october", "november",
                "december"
            ]
            # NOTE(review): may raise IndexError when the ordinal is one of
            # the last two tokens of the sentence — confirm upstream padding.
            if self.answer[j + 1][0].lower() in months or self.answer[
                    j + 2][0].lower() in months:
                self.wh_word = wh_words[1]  # When
                self.type = 'day_of_month'
            else:
                self.wh_word = wh_words[7]  # Which
                self.type = 'placement'
        # How many if non of above
        else:
            self.wh_word = wh_words[4]  # How many
            self.type = 'quantity'
# NOTE(review): '!pip install ...' is IPython/Jupyter magic, not plain Python —
# this cell only runs inside a notebook.
!pip install names-dataset
from names_dataset import NameDataset
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as stopwords_english
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

m = NameDataset()
# Domain-specific words to exclude from the movie-plot corpus features.
movie_words = ["movie", "film", "plot", "begins", "opens", "starts", "piece",
               "named", "woman", "women", "man", "men", "prologue", "help",
               "helping"]
# Collect every token from the first 5000 corpus entries that the name
# database recognizes as a first or last name; these become stop words.
# NOTE(review): `corpus` is defined in an earlier notebook cell.
stop_words = []
for i in range(5000):
    title = word_tokenize(corpus[i])
    titlecopy = []  # NOTE(review): unused — presumably leftover scratch list.
    for word in title:
        if (m.search_first_name(word) or m.search_last_name(word)
                ) and word not in stop_words:
            stop_words.append(word)
stop = list(stop_words) + list(movie_words)

"""## Μετατροπή σε TFIDF Το πρώτο βήμα θα είναι λοιπόν να μετατρέψετε το corpus σε αναπαράσταση tf-idf: """

# Fit a default TF-IDF vectorizer on the corpus and transform it.
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
vectorizer.fit(corpus)
corpus_tf_idf = vectorizer.transform(corpus)

"""Η συνάρτηση [TfidfVectorizer](http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html) όπως καλείται εδώ **δεν είναι βελτιστοποιημένη**. Οι επιλογές των μεθόδων και παραμέτρων της μπορεί να έχουν **δραματική επίδραση στην ποιότητα των συστάσεων** και είναι διαφορετικές για κάθε dataset. Επίσης, οι επιλογές αυτές έχουν πολύ μεγάλη επίδραση και στη **διαστατικότητα και όγκο των δεδομένων**. Η διαστατικότητα των δεδομένων με τη σειρά της θα έχει πολύ μεγάλη επίδραση στους **χρόνους εκπαίδευσης**, ιδιαίτερα στη δεύτερη εφαρμογή της άσκησης. Ανατρέξτε στα notebooks του εργαστηρίου και στο [FAQ](https://docs.google.com/document/d/1hou1gWXQuHAB7J2aV44xm_CtAWJ63q6Cu1V6OwyL_n0/edit?usp=sharing) των ασκήσεων.
def generate_potential_employees(f_path):
    """Parse email_employee.csv and derive potential employee names from addresses.

    Rows whose eid is missing/non-numeric are parsed as
    firstname(.middle)?.lastname@enron.com; the guessed first/last names
    (swapped when the name database suggests the order is reversed) are
    written to potential_email_employees.csv in the same directory.
    """
    f_name = os.path.join(f_path, 'email_employee.csv')
    nd = NameDataset()
    # BUG FIX: raw string and escaped dot — the plain '.' in 'enron.com'
    # previously matched ANY character (e.g. 'enronXcom').
    p = re.compile(r'([a-z]*)\.([a-z]\.)?([a-z]*)@enron\.com')
    out_f_name = os.path.join(f_path, 'potential_email_employees.csv')
    num_rows_written = 0
    with open(out_f_name, 'w', newline='') as out_f:
        field_names = ['address', 'firstName', 'lastName']
        writer = csv.DictWriter(out_f, fieldnames=field_names)
        writer.writeheader()
        with open(f_name, 'r') as f:
            reader = csv.DictReader(f, quotechar='"')
            for row in reader:
                eid = row['eid']
                email_id = row['address']
                # BUG FIX: narrowed the bare except to the conversion errors.
                try:
                    v = int(eid)
                except (TypeError, ValueError):
                    v = 0
                if v != 0:
                    print("found eid: {} for email_id: {}, continuing".format(
                        v, email_id))
                    continue
                ## match this email id
                m = p.match(email_id)
                if m is None:
                    continue
                ## get the first and last names from the regex groups
                g = m.groups()
                fn = g[0]
                ln = g[-1]
                if not fn or not ln:
                    print("invalid name: firstName: {}, lastName: {}".format(
                        fn, ln))
                    continue
                d = {'address': email_id}
                ## search for the first name in the db; swap when the local
                ## part looks like lastname.firstname
                fn_in_nd = nd.search_first_name(fn)
                if not fn_in_nd and nd.search_first_name(ln):
                    d['firstName'] = ln
                    d['lastName'] = fn
                else:
                    ## continue as default
                    d['firstName'] = fn
                    d['lastName'] = ln
                ## write this to the output file as a row
                writer.writerow(d)
                num_rows_written += 1
    print("wrote {} rows of data to the file: {}".format(
        num_rows_written, out_f_name))
# Validate the "name" column of scraped results and split it into
# first/last name columns when the name database recognizes the parts.
# BUG FIX: construct NameDataset ONCE — the original re-loaded the whole
# dataset for every CSV row inside the loop.
m = NameDataset()
with open('data/scraped_results_1615315764360.csv', 'r') as csv_file_input:
    with open('data/scraped_results_1615315764360-out.csv', 'w') as csv_file_output:
        csv_reader = csv.reader(csv_file_input, delimiter=',')
        writer = csv.writer(csv_file_output, lineterminator='\n')
        line_count = 0
        for row in csv_reader:
            if line_count == 0:
                # Header row: echo and copy through unchanged.
                print(f'Column names are {", ".join(row)}')
                writer.writerow(row)
                line_count += 1
            else:
                firstAndLastName = row[0].split()
                # BUG FIX: guard against single-token names, which previously
                # raised IndexError on firstAndLastName[1].
                if len(firstAndLastName) >= 2:
                    firstNameRes = m.search_first_name(firstAndLastName[0])
                    lastName = m.search_last_name(firstAndLastName[1])
                    if firstNameRes == True:
                        print("First Name is Valid", firstNameRes)
                        row[1] = firstAndLastName[0]
                    if lastName == True:
                        print("last name is valid", lastName)
                        row[2] = firstAndLastName[1]
                writer.writerow(row)
def is_name(word):
    """Return a truthy value when *word* is known BOTH as a first and a last name.

    BUG FIX: the original used bitwise '&', which never short-circuits (the
    last-name lookup always ran) and breaks if the lookups ever return
    non-boolean values; logical 'and' yields the same result for booleans.
    """
    m = NameDataset()
    return m.search_first_name(word) and m.search_last_name(word)
"""Quick demo of the names-dataset first/last name lookups."""
from names_dataset import NameDataset

name_db = NameDataset()
# Same two lookups, same order, same printed output as the original demo.
for query, lookup in (('Brian', name_db.search_first_name),
                      ('Remy', name_db.search_last_name)):
    print(lookup(query))
# Extract Oscar-winner candidate names from cleaned tweets.
for tweet in cleaned_tweets:
    # Tokenize each cleaned tweet.
    tokenized_tweets.append(word_tokenize(tweet))

names_list = []
# Task-specific words that should never be treated as winner names.
stop_words.update(
    ["congratulations", "winner", "oscar", "oscars", "last", "night", "rt",
     "sent", "dm", "big", "best", "new", "tweet", "happy"])
for tweet in tokenized_tweets:
    temp = []
    for word in set(tweet):
        if word.lower() not in stop_words:
            # Keep the word when the name database knows it as a first or
            # last name.
            if m.search_first_name(word.lower()) or m.search_last_name(word.lower()):
                temp.append(word.lower())
    # Join first name and last name into one candidate string.
    if len(temp) > 1:
        temp = [" ".join(temp)]
    # Add the candidate if one was found.
    if len(temp) != 0:
        # BUG FIX: dedupe on the candidate STRING. The original tested
        # `temp not in names_list`, comparing a list against stored strings,
        # so the duplicate check never matched (the final printed set() hid
        # the duplicates). The isinstance(temp, list) branch was dead code:
        # temp is always a list here.
        if temp[0] not in names_list:
            names_list.append(temp[0])
print("Names of winners: ")
print(set(names_list))