from names_dataset import NameDataset


def namecheck(inp):
    """Return True if the input is a known first name or last name."""
    m = NameDataset()
    inp = str(inp)
    # The original nested if/elif branches reduce to this single check.
    return m.search_first_name(inp) or m.search_last_name(inp)
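# Minimal usage sketch for namecheck; the sample words are illustrative and the
# boolean return values assume the v1.x names-dataset API used throughout this file.
for candidate in ('Brian', 'Remy', 'table'):
    print(candidate, namecheck(candidate))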
import os
import sys

from names_dataset import NameDataset


def main():
    m = NameDataset()
    if os.path.isfile(sys.argv[1]):
        words = read_dict_file(sys.argv[1])  # helper defined elsewhere in the original script
    else:
        words = [sys.argv[1]]
    # Cheap word tokenizer: drop periods, question marks and apostrophes.
    words = ' '.join(words).replace('.', ' ').replace('?', ' ').replace('\'', ' ').split(' ')
    output = ''
    for word in words:
        if m.search_first_name(word, use_upper_case=True):
            output += '\033[44m'   # ANSI blue background for first names
            output += word
            output += '\033[0m'    # reset colors
        elif m.search_last_name(word, use_upper_case=True):
            output += '\033[46m'   # ANSI cyan background for last names
            output += word
            output += '\033[0m'    # reset colors
        else:
            output += word
        output += ' '
    print(output)
from typing import List, Union

from names_dataset import NameDataset
from nltk.tokenize import word_tokenize


def remove_name(input_text_or_list: Union[str, List[str]]) -> List[str]:
    """ Remove names from the input text """
    name_searcher = NameDataset()
    if isinstance(input_text_or_list, str):
        tokens = word_tokenize(input_text_or_list)
        processed_tokens = [
            token for token in tokens
            if (not name_searcher.search_first_name(token))
            and (not name_searcher.search_last_name(token))
        ]
    else:
        processed_tokens = [
            token for token in input_text_or_list
            if (not name_searcher.search_first_name(token))
            and (not name_searcher.search_last_name(token))
            and token is not None and len(token) > 0
        ]
    return processed_tokens
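# Minimal usage sketch for remove_name; the sentence is illustrative and assumes
# nltk's 'punkt' tokenizer data has already been downloaded.
print(remove_name('Brian met Alice at the station'))  # tokens recognized as names are filtered out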
import collections
import math
import pickle
import string

import nltk
from names_dataset import NameDataset
from nltk.corpus import wordnet


def calculate_query_TFIDF(query_string, inverted_index, num_files, profile):
    # Words that appear often in profile text but have no bearing on the user's likes/dislikes.
    words_to_remove = ["birthday", "bday", "facebook", "lol", "thank", "christmas", "hanukkah", "happy"]
    # First we must preprocess the query (the social media profile).
    m = NameDataset()
    tokens = nltk.word_tokenize(query_string)  # Tokenize the string using NLTK
    tokens = [x for x in tokens if x not in string.punctuation]  # Don't include punctuation
    query_tokens = remove_stopwords(tokens)  # Remove the stopwords
    # Only keep words that are: 1) in English, 2) not in words_to_remove, 3) not a first or last name.
    query_tokens = [x for x in query_tokens
                    if wordnet.synsets(x)
                    and x not in words_to_remove
                    and not m.search_first_name(x)
                    and not m.search_last_name(x)]
    query_tokens = stem_words(query_tokens)  # Stem words for preprocessing
    for i in range(len(query_tokens)):  # Convert all tokens to lowercase
        query_tokens[i] = query_tokens[i].lower()
    query_tokens = [x for x in query_tokens if x != 'birthdai']  # Make sure this common stemmed word doesn't appear

    query_appearances = collections.Counter()
    query_weights = [0] * len(inverted_index)  # Initialize vector to hold query weights
    query_length = 0.0
    l = list(inverted_index.keys())  # Ordered list of index terms

    for query_token in query_tokens:  # Counter that keeps track of word appearances
        query_appearances[query_token] += 1

    # Iterate through each term in the query vector and assign a nonzero weight
    # if the term appears in the inverted index.
    for query_term in query_appearances:
        if query_term in inverted_index:
            index_of_word = l.index(query_term)  # Since it is an ordered dict, this is the term's index
            num_postings = inverted_index[query_term].length + 0.0  # Document frequency
            idf = math.log10(num_files / num_postings)  # Inverse document frequency
            tf = query_appearances[query_term]  # Term frequency
            query_weights[index_of_word] = tf * idf  # Query weight
            query_length += (tf * idf) * (tf * idf)  # Update running total for query length
    query_length = math.sqrt(query_length)  # Calculate final query length

    # Write the query data to pickles.
    pickle_out = open("data/" + profile + "/query_appearances.pickle", "wb")
    pickle.dump(query_appearances, pickle_out)
    pickle_out.close()
    pickle_out2 = open("data/" + profile + "/query_weights.pickle", "wb")
    pickle.dump(query_weights, pickle_out2)
    pickle_out2.close()

    return (query_weights, query_length, query_appearances)  # Tuple of the computed query data
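# Worked example of the weighting scheme above, with illustrative numbers
# (not taken from the project's data): tf = 2, num_files = 1000, document frequency = 10.
example_tf = 2
example_idf = math.log10(1000 / 10)        # = 2.0
example_weight = example_tf * example_idf  # = 4.0, the value that would be stored in query_weights
# The term contributes example_weight ** 2 = 16.0 to the squared query length.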
from names_dataset import NameDataset


# TextClassifier's helpers (read_frequent_words, fetch_urls, scrape_text) are defined
# elsewhere in the original project.
def fetch_training_data(keywords):
    frequent = TextClassifier.read_frequent_words()
    nd = NameDataset()
    # frequent.extend(TextClassifier.read_frequent_names())
    # k is a list of keywords for a specific category
    training = []
    for k in keywords:
        urls = TextClassifier.fetch_urls(k, 10)
        text = []
        for url in urls:
            text.extend(TextClassifier.scrape_text(url))
        text = [
            t.lower() for t in text
            if t.lower() not in frequent
            and not nd.search_first_name(t)
            and not nd.search_last_name(t)
        ]
        print(text[0:100])
        print(len(text))
        training.append(text)
    return training
# Method of the question-extraction class; os, re, pickle, settings, PorterStemmer,
# NameDataset and stop_words are imported/defined at module level in the original project.
def wh_finder(self, entity_lst):
    # Stemmer
    porter = PorterStemmer()
    # All our question words
    wh_words = [
        'who', 'when', 'where', 'what', 'how many', 'what language',
        'what percentage', 'which', 'what time'
    ]
    # Language model to detect languages
    language_model = open(
        os.path.join(settings.BASE_DIR,
                     'question_extractor/extract/models/language_model'), 'rb')
    language_model = pickle.load(language_model)
    # Location model to detect locations
    location_model = open(
        os.path.join(settings.BASE_DIR,
                     'question_extractor/extract/models/location_model'), 'rb')
    location_model = pickle.load(location_model)
    # Names dataset
    m = NameDataset()

    i = 0
    word_scores = self.word_scores
    maxxy = -10
    while i < len(self.answer):
        # Stem and lowercase the word
        word = porter.stem(self.answer[i][0].lower())
        if word in language_model:
            # Languages are usually tagged as JJ; re-tag them as NN.
            self.answer[i] = (self.answer[i][0], 'NN', self.answer[i][2])
        # Only consider non-stop words that are nouns or digits
        if self.answer[i][0].lower() not in stop_words and (
                self.answer[i][1] == 'NNP' or self.answer[i][1] == 'NNPS'
                or self.answer[i][1] == 'NN' or self.answer[i][1] == 'NNS'
                or self.answer[i][1] == 'CD') and re.match(r'[\w]', word):
            # maxxy tracks the highest-scoring word, i.e. the most important one
            if word_scores[word] > maxxy:
                if self.answer[i][0] not in entity_lst:
                    maxxy = word_scores[word]
                    j = i
                # Make sure the sentence is not excluded due to a duplicate entity
                else:
                    replace = i
        i += 1
    try:
        j = j
    except NameError:
        # No new entity was found; fall back to the duplicate one.
        j = replace

    word = self.answer[j]
    self.entity = word[0]
    self.position = j
    # If the word is a proper noun, plural noun or common noun
    if word[1] == 'NNP' or word[1] == 'NNPS' or word[1] == 'NN' or word[1] == 'NNS':
        # Four options: location (where), language (what language), name (who), other (what)
        # Location
        if word[0].lower() in location_model:
            self.wh_word = wh_words[2]
            self.type = 'location'
        # Language of some kind
        elif word[0].lower() in language_model:
            self.wh_word = wh_words[5]
            self.type = 'language'
        # Name
        elif m.search_first_name(word[0].lower()) or m.search_last_name(word[0].lower()):
            self.wh_word = wh_words[0]
            self.type = 'name'
        # "What" if none of the above
        else:
            self.wh_word = wh_words[3]
            self.type = 'thing'
    # If the word is a number (CD)
    elif word[1] == 'CD':
        # Time
        if re.match(r'([12][\d][:.][0-6][\d])+(am|pm)?', word[0]):
            self.wh_word = wh_words[8]  # What time
            self.type = 'time'
        # Date
        elif re.match(r'[1,2,3][\d][/.][01][\d]?[\d]?[/.][10][\d][\d]?[\d]?', word[0]):
            self.wh_word = wh_words[1]  # When
            self.type = 'date'
        # Percentage
        elif '%' in word[0]:
            self.wh_word = wh_words[6]  # What percentage
            self.type = '%'
        # Day of month or placement
        elif re.match(r'[\d][\d]?(st|nd|rd|th)', word[0]):
            months = [
                "january", "february", "march", "april", "may", "june",
                "july", "august", "september", "october", "november", "december"
            ]
            if self.answer[j + 1][0].lower() in months or self.answer[j + 2][0].lower() in months:
                self.wh_word = wh_words[1]  # When
                self.type = 'day_of_month'
            else:
                self.wh_word = wh_words[7]  # Which
                self.type = 'placement'
        # "How many" if none of the above
        else:
            self.wh_word = wh_words[4]  # How many
            self.type = 'quantity'
!pip install names-dataset

from names_dataset import NameDataset
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as stopwords_english
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

m = NameDataset()
movie_words = ["movie", "film", "plot", "begins", "opens", "starts", "piece", "named",
               "woman", "women", "man", "men", "prologue", "help", "helping"]
stop_words = []
for i in range(5000):
    title = word_tokenize(corpus[i])
    for word in title:
        if (m.search_first_name(word) or m.search_last_name(word)) and word not in stop_words:
            stop_words.append(word)
stop = list(stop_words) + list(movie_words)

"""## Conversion to TF-IDF

So the first step will be to convert the corpus into a tf-idf representation:
"""

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
vectorizer.fit(corpus)
corpus_tf_idf = vectorizer.transform(corpus)

"""The [TfidfVectorizer](http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html) as called here **is not optimized**. The choice of its methods and parameters can have a **dramatic effect on the quality of the recommendations** and differs for each dataset. These choices also have a very large impact on the **dimensionality and volume of the data**. The dimensionality of the data, in turn, has a very large impact on the **training times**, especially in the second part of the exercise. Refer to the lab notebooks and to the exercise [FAQ](https://docs.google.com/document/d/1hou1gWXQuHAB7J2aV44xm_CtAWJ63q6Cu1V6OwyL_n0/edit?usp=sharing).
"""
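"""As a hedged illustration of how those parameters might be tuned (the specific values below are assumptions, not reference settings for the exercise), the vectorizer can reuse the `stop` list built above and prune very rare and very common terms to reduce dimensionality:"""

vectorizer_tuned = TfidfVectorizer(stop_words=stop, min_df=5, max_df=0.5, sublinear_tf=True)
corpus_tf_idf_tuned = vectorizer_tuned.fit_transform(corpus)
print(corpus_tf_idf_tuned.shape)  # fewer columns than the unpruned representation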
import csv

from names_dataset import NameDataset

# csv_file_input is assumed to have been opened earlier in the original script.
with open('data/scraped_results_1615315764360-out.csv', 'w') as csv_file_output:
    csv_reader = csv.reader(csv_file_input, delimiter=',')
    writer = csv.writer(csv_file_output, lineterminator='\n')
    line_count = 0
    for row in csv_reader:
        if line_count == 0:
            print(f'Column names are {", ".join(row)}')
            writer.writerow(row)
            line_count += 1
        else:
            # print(f'\t{row[0]} works in the {row[1]} department, and was born in {row[2]}.')
            firstAndLastName = row[0].split()
            m = NameDataset()
            firstNameRes = m.search_first_name(firstAndLastName[0])
            lastName = m.search_last_name(firstAndLastName[1])
            if firstNameRes:
                print("First name is valid", firstNameRes)
                row[1] = firstAndLastName[0]
            if lastName:
                print("Last name is valid", lastName)
                row[2] = firstAndLastName[1]
            writer.writerow(row)
            # print(firstNameRes, lastName)
            # print(firstAndLastName[0])
            line_count += 1
from names_dataset import NameDataset


def is_name(word):
    """Return True only if the word is both a known first name and a known last name."""
    m = NameDataset()
    return m.search_first_name(word) and m.search_last_name(word)
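# Minimal usage sketch; the sample words are illustrative. With the 'and' check above,
# a word counts as a name only if it appears in both the first-name and last-name datasets.
for w in ('Jordan', 'Brian', 'table'):
    print(w, is_name(w))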
from names_dataset import NameDataset

m = NameDataset()
print(m.search_first_name('Brian'))
print(m.search_last_name('Remy'))
for tweet in cleaned_tweets:
    # Tokenizing
    tokenized_tweets.append(word_tokenize(tweet))

names_list = []
# Removing stopwords from the tokenized tweets
stop_words.update(
    ["congratulations", "winner", "oscar", "oscars", "last", "night", "rt",
     "sent", "dm", "big", "best", "new", "tweet", "happy"])
for tweet in tokenized_tweets:
    temp = []
    for word in set(tweet):
        if word.lower() not in stop_words:
            # Checking whether that word is the name of a person
            if m.search_first_name(word.lower()) or m.search_last_name(word.lower()):
                temp.append(word.lower())
    # Joining first name and last name
    if len(temp) > 1:
        temp = [" ".join(temp)]
    # Adding the name to the list if one was found
    if len(temp) != 0:
        if temp not in names_list:
            if isinstance(temp, list):
                names_list.append(temp[0])
            else:
                names_list.append(temp)

print("Names of winners: ")
print(set(names_list))