def add_new_doc(self, document, documents_list_length=10000):
    """
    This function performs the indexing process for a document object.
    The saved information is captured in two dictionaries ('inverted index' and 'posting').
    :param document: a document that needs to be indexed.
    :return: -
    """
    try:
        document_dictionary = document.term_doc_dictionary
        # self.countDoc += 1
        for term in document_dictionary.keys():
            if self.stemming == 'y':
                my_stemmer = Stemmer()
                term = my_stemmer.stem_term(term)
            # Update inverted index and posting
            if term not in self.inverted_idx.keys():
                # [number of docs containing the term, [(freq in doc, doc id), ...]]
                self.inverted_idx[term] = [
                    1, [(document_dictionary[term], document.tweet_id)]
                ]
            else:
                self.inverted_idx[term][0] += 1  # number of docs containing the term
                self.inverted_idx[term][1].append(
                    (document_dictionary[term], document.tweet_id))  # (freq in doc, doc id)

            if term not in self.postingDict.keys():
                self.postingDict[term] = [(document.tweet_id, document_dictionary[term])]
            else:
                self.postingDict[term].append(
                    (document.tweet_id, document_dictionary[term]))
            # self.countTweet -= 1

            if document.tweet_id not in self.tweet_dict.keys():
                # [[term, freq in tweet], number of unique terms in tweet, number of terms in tweet]
                self.tweet_dict[document.tweet_id] = [
                    [term, document_dictionary[term]], 1, 0
                ]
            elif document_dictionary[term] > self.tweet_dict[document.tweet_id][0][1]:
                # tweet exists: compare the frequency with the current most common term
                if self.tweet_dict[document.tweet_id][0][1] == 1:
                    # the term being replaced is unique: add it to the unique-term count
                    self.tweet_dict[document.tweet_id][1] += 1
                # replace the most common term
                self.tweet_dict[document.tweet_id][0] = [term, document_dictionary[term]]
                self.tweet_dict[document.tweet_id][2] += 1
            elif document_dictionary[term] == 1:
                # tweet exists, term is not the most common; check if it is unique
                self.tweet_dict[document.tweet_id][1] += 1
                self.tweet_dict[document.tweet_id][2] += 1
    except:
        # print('problem in indexer : add_new_doc')
        # print(traceback.print_exc())
        pass
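# --- Illustrative sketch (not part of the original code) ---
# A minimal, self-contained example of the structures add_new_doc fills above.
# The tweet IDs and term frequencies below are made up, and the plain dicts
# stand in for the indexer's self.inverted_idx / self.postingDict attributes.
inverted_idx = {}  # term -> [number of docs containing the term, [(freq in doc, tweet_id), ...]]
postingDict = {}   # term -> [(tweet_id, freq in doc), ...]

toy_docs = {101: {"corona": 3, "mask": 1}, 102: {"mask": 2}}
for tweet_id, term_freqs in toy_docs.items():
    for term, freq in term_freqs.items():
        if term not in inverted_idx:
            inverted_idx[term] = [1, [(freq, tweet_id)]]
        else:
            inverted_idx[term][0] += 1
            inverted_idx[term][1].append((freq, tweet_id))
        postingDict.setdefault(term, []).append((tweet_id, freq))

print(inverted_idx["mask"])   # [2, [(1, 101), (2, 102)]]
print(postingDict["corona"])  # [(101, 3)]
# --- end of illustrative sketch ---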
def parse_doc(self, doc_as_list):
    """
    This function takes a tweet document as a list and breaks it into different fields.
    :param doc_as_list: list representing the tweet.
    :return: Document object with corresponding fields.
    """
    tweet_id = doc_as_list[0]
    tweet_date = doc_as_list[1]
    full_text = doc_as_list[2]
    url = doc_as_list[3]
    indice = doc_as_list[4]
    retweet_text = doc_as_list[5]
    retweet_url = doc_as_list[6]
    retweet_indice = doc_as_list[7]
    quote_text = doc_as_list[8]
    quote_url = doc_as_list[9]
    quoted_indice = doc_as_list[10]
    retweet_quoted_text = doc_as_list[11]
    retweet_quoted_url = doc_as_list[12]
    retweet_quoted_indice = doc_as_list[13]

    term_dict = {}
    tokenized_text = self.parse_sentence(full_text)
    tokenized_quote = self.parse_sentence(quote_text)
    tokenized_url = self.handle_url(url)

    doc_length = len(tokenized_text)  # after text operations - length of full_text

    new_tokenized_text = tokenized_text + tokenized_url + tokenized_quote

    if self.stemming is True:
        s = Stemmer()
        # stem every token; building a new list avoids mutating the list while iterating over it
        new_tokenized_text = [s.stem_term(token) for token in new_tokenized_text]

    for term in new_tokenized_text:
        if term != "":  # or (term.isalpha() and len(term) == 1)
            if term not in term_dict:
                term_dict[term] = 1
            else:
                term_dict[term] += 1

    document = Document(tweet_id, tweet_date, full_text, url, retweet_text,
                        retweet_url, quote_text, quote_url, term_dict, doc_length)
    return document
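# --- Illustrative sketch (not part of the original code) ---
# The frequency-counting loop above is equivalent to collections.Counter;
# the token list here is made up purely for illustration.
from collections import Counter

tokens = ["corona", "mask", "", "corona"]
term_dict = dict(Counter(t for t in tokens if t != ""))
print(term_dict)  # {'corona': 2, 'mask': 1}
# --- end of illustrative sketch ---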
class Parse: def __init__(self, stem): self.stop_words = stopwords.words('english') self.stop_words.extend([ 'ourselves', 'hers', 'between', 'yourself', 'but', 'again', 'there', 'about', 'once', 'during', 'out', 'very', 'having', 'with', 'they', 'own', 'an', 'be', 'some', 'for', 'do', 'its', 'yours', 'such', 'into', 'of', 'most', 'itself', 'other', 'off', 'is', 's', 'am', 'or', 'who', 'as', 'from', 'him', 'each', 'the', 'themselves', 'until', 'below', 'are', 'we', 'these', 'your', 'his', 'through', 'don', 'nor', 'me', 'were', 'her', 'more', 'himself', 'this', 'down', 'should', 'our', 'their', 'while', 'above', 'both', 'up', 'to', 'ours', 'had', 'she', 'all', 'when', 'at', 'any', 'before', 'them', 'same', 'and', 'been', 'have', 'in', 'will', 'on', 'does', 'yourselves', 'then', 'that', 'because', 'what', 'over', 'why', 'so', 'can', 'did', 'now', 'under', 'he', 'you', 'herself', 'has', 'just', 'where', 'too', 'only', 'myself', 'which', 'those', 'i', 'after', 'few', 'whom', 't', 'being', 'if', 'theirs', 'my', 'against', 'a', 'by', 'doing', 'it', 'how', 'further', 'was', 'here', 'than', 'rt', "don't", '-', '&', 'it’s', 'don’t', 'i’m', "it's", "doesn't", 'https', 't.co', 'twitter.com', 'weve', 'ur', 'due', 'damn', 'us', 'theyre', 'would', 'might' ]) self.stop_words_dict = { self.stop_words[i]: 0 for i in range(0, len(self.stop_words)) } # self.extra_stop_words = {"rt": 0, "https": 0, "t.co": 0, "twitter.com": 0, "weve": 0, "ur": 0, "due": 0, "damn": 0, "us": 0, "theyre": 0, "would": 0, "might": 0} # self.stop_words_dict.update(self.extra_stop_words) self.term_dict = {} self.toStem = stem self.text_tokens = [] if self.toStem: self.stemmer = Stemmer() def parse_sentence(self, text): """ This function tokenize, remove stop words and apply lower case for every word within the text :param text: :return: """ text_splitted = text.split() # stop_words = stopwords.words('english') ##lancaster = LancasterStemmer() i = 0 while i < len(text_splitted): try: word = text_splitted[i].strip('[').strip(']').strip('(').strip( ')').strip('{').strip('}') word = re.sub('[^A-z-%_@#.,$!?/0-9]', '', word) if word[len(word) - 1] == '%': new_word = word[:len(word) - 1] if new_word.isdigit() or re.search( r'^-?[0-9]+\.[0-9]+$', new_word) or re.search( r'^-?[0-9]+\/[0-9]+$', new_word): number = self.parse_numbers(new_word) percent_number = str(number) + '%' self.text_tokens.append(percent_number) i += 1 continue else: word = re.sub('[^A-z.%0-9]', '', word) if word != '': self.text_tokens.append(word) i += 1 continue elif word.isdigit() or re.search( r'^-?[0-9]+\.[0-9]+$', word) or re.search( r'^-?[0-9]+\/[0-9]+$', word): if i < len(text_splitted) - 1: next_word = re.sub('[^A-z%_@#.,!?$/0-9]', '', text_splitted[i + 1]) number = self.parse_numbers(word, next_word) if number.endswith('K') or number.endswith( 'B') or number.endswith('M'): i += 1 elif (next_word == 'percent') or (next_word == 'percentage'): number = str(word) + '%' i += 1 self.text_tokens.append(number) i += 1 else: number = self.parse_numbers(word) self.text_tokens.append(number) i += 1 continue except: ## token is not a number word = re.sub('[^A-z-%_@#.,$!?/0-9]', '', text_splitted[i]) if word.startswith('http') or word.startswith('www'): i += 1 continue word = re.sub(r'([-?!/,.]+)', r',', word) words = word.split(',') for word in words: if (len(word) > 0) and ( word.isspace() == False) and word.lower() not in self.stop_words_dict: if (word[0] == '#'): word = word[1:] hashtags = word.split('#') for h in hashtags: h = re.sub('[^A-z_0-9]', '', h) if h != '': 
self.parse_hashtags(h) elif word[0] == '@': word = word[1:] tags = word.split('@') for t in tags: t = re.sub('[^A-z_0-9]', '', t) if t != '': self.parse_tags(t) elif word[0] == '"' or word[0] == "'" or word[ 0] == '‘' or word[0] == '’': iterations = self.parse_quote(word, i, text_splitted) i += iterations continue else: word = re.sub('[^A-Za-z$%0-9]', '', word) if word != '': if self.toStem: self.text_tokens.append( self.stemmer.stem_term((word))) else: self.text_tokens.append(word) i += 1 return self.text_tokens ##print(self.text_tokens) def parse_tags(self, word): temp = re.sub('[^A-Za-z$0-9]', '', word) if temp != '': t_word = '@' + str(word.lower()) self.text_tokens.append(t_word) def parse_quote(self, word, i, text_splitted): start_iterations = i word = str(word) if word[len(word) - 1] == '"' or word[len(word) - 1] == "'" or word[ len(word) - 1] == '‘' or word[len(word) - 1] == '’': self.text_tokens.append( word.upper().strip('"').strip('"').strip('‘')) else: quote = word while True: if i < len(text_splitted) - 1: next_word = re.sub('[^A-z%_@#.,!?$/0-9]', '', text_splitted[i + 1]) if len(next_word) == 0: i += 1 elif (next_word[len(next_word) - 1] == "'") or ( next_word[len(next_word) - 1] == '"' ) or (next_word[len(next_word) - 1] == '‘') and (next_word[len(next_word) - 1] == '’'): quote += ' ' + next_word self.text_tokens.append(quote.upper().strip('"').strip( "'").strip('‘').strip('’')) i += 1 break else: quote += ' ' + next_word i += 1 elif i == (len(text_splitted) - 1): self.text_tokens.append(quote.upper().strip('"').strip( "'").strip('‘').strip('’')) break return i - start_iterations + 1 def parse_hashtags(self, element): element = element.replace(' ', '') expanded = " ".join( [a for a in re.split('([A-Z][a-z]+)', element) if a]) hashtag_tokens = expanded.split(' ') for w in hashtag_tokens: if w != '' and '_' not in w: if self.toStem: self.text_tokens.append(self.stemmer.stem_term((w))) else: self.text_tokens.append(w) word = re.sub('[^A-z$_0-9]', '', element) temp = re.sub('[^A-Za-z%$0-9]', '', word) if temp != '': self.text_tokens.append('#' + element) def parse_url(self, url): name = '' for character in url: if character == ' ': break if ('a' <= character <= 'z') or ('A' <= character <= 'Z') or ( '0' <= character <= '9') or (character == '.'): name += character elif (len(name) > 1) or ((len(name) == 1) and ('a' <= name <= 'z') or ('A' <= name <= 'Z') or ('0' <= name <= '9')): ##if name.isdigit(): ## name = self.parse_numbers(name) if name.lower() not in self.stop_words_dict and name != ' ': if name not in self.term_dict: self.term_dict[name] = 1 else: self.term_dict[name] += 1 name = '' if (len(name) > 1) or ((len(name) == 1) and ('a' <= name <= 'z') or ('A' <= name <= 'Z') or ('0' <= name <= '9')): ##if name.isdigit(): ## name = self.parse_numbers(name) if name.lower() not in self.stop_words_dict and name != ' ': if name not in self.term_dict: self.term_dict[name] = 1 else: self.term_dict[name] += 1 def parse_numbers(self, item, next_i=''): r = ['', 'K', 'M', 'B'] if bool(re.search(r'^-?[0-9]+\.[0-9]+$', item)): return item elif bool(re.search(r'^-?[0-9]+\/[0-9]+$', next_i)) and float(item) <= 999: return item + ' ' + next_i elif bool(re.search(r'^-?[0-9]+\/[0-9]+$', item)): return item elif (next_i == "Thousand" or next_i == "thousand") and float(item) <= 9999: return item + "K" elif (next_i == "M" or next_i == "m" or next_i == "Million" or next_i == "million") and float(item) <= 9999: return item + "M" elif (next_i == "B" or next_i == "b" or next_i == "Billion" or next_i 
== "billion") and float(item) <= 9999: return item + "B" num = float(item) magnitude = 0 while abs(num) >= 1000: magnitude += 1 num /= 1000.0 if magnitude >= 3: break return str("%.3f" % num).rstrip("0").rstrip(".") + '' + str( r[magnitude]) def parse_doc(self, doc_as_list): """commi This function takes a tweet document as list and break it into different fields :param doc_as_list: list re-preseting the tweet. :return: Document object with corresponding fields. """ tweet_id = doc_as_list[0] tweet_date = doc_as_list[1] full_text = doc_as_list[2] url = doc_as_list[3] retweet_text = doc_as_list[4] retweet_url = doc_as_list[5] quote_text = doc_as_list[6] quote_url = doc_as_list[7] self.term_dict = {} self.text_tokens = [] self.parse_sentence(full_text) doc_length = len(self.text_tokens) # after text operations.4 for term in self.text_tokens: if term not in self.term_dict: self.term_dict[term] = 1 else: self.term_dict[term] += 1 num_of_uniqe_terms = len(self.term_dict) max_tf = 0 for item in self.term_dict.values(): if item > max_tf: max_tf = item if (url is not None) and (url != '{}'): self.parse_url(url) if (quote_text is not None) and (quote_text != '{}'): self.parse_url(quote_text) str_retweet_url = str(retweet_url) url_retweet_url_index = str_retweet_url.find('https') if url_retweet_url_index != -1: url_retweet_url = str_retweet_url[url_retweet_url_index:] if (url_retweet_url is not None) and (url_retweet_url != '{}'): self.parse_url(url_retweet_url) document = Document(tweet_id, tweet_date, full_text, url, retweet_text, retweet_url, quote_text, quote_url, self.term_dict, doc_length, max_tf, num_of_uniqe_terms, self.text_tokens) return document
class Parse: """ Parsing, syntax analysis, or syntactic analysis is the process of analyzing a string of symbols, either in natural language, computer languages or data structures, conforming to the rules of a formal grammar. The term parsing comes from Latin pars """ def __init__(self, config=None): self.tmp_for_entites = {} self.stop_words = stopwords.words('english') + [ '?', '!', ',', '+', '-', '*', '/', '"', '.', '<', '>', '=', ':', '', '{', '{}', '}', '[', ']', '[]', 'are', 'and', 'an', 'at', 'am', 'a', 'even', 'every', 'everyone', 'rt', 'RT' ] self.global_dict = {} #value=number of docs self.post_dict = { } # key="word",value=[parquet name,index in parquet,tweet id,frequency in tweet,location in tweet,tf] self.entities = {} self.path_stop_words = [ 'RT', "rt", 'tweet', 'www', 'http', 'https', 'WWW' ] self.corona_list = [ "cov", 'corona', 'coronavirus', 'covid', 'covid19', 'covid 19', 'corona virus', 'virus corona', 'corona_virus', 'virus_corona', "virus" ] self.config = config self.trump = [ "donald", "donald trump", "trump donald", "president", "trump_donald", "donald_trump", "trump-donald", "donald-trump" ] self.stemmer = None if self.config.toStem: self.stemmer = Stemmer() def parse_sentence(self, sentence): if (sentence == None): return return self.tokenized_parse(sentence) def parse_doc(self, doc_as_list): """ This function takes a tweet document as list and break it into different fields :param doc_as_list: list re-preseting the tweet. :return: Document object with corresponding fields. """ local_dict = { } # key="word",value=[parquet name,index in parquet,tweet id,frequency in tweet,location in tweet] tweet_id = doc_as_list[0] tweet_date = doc_as_list[1] full_text = doc_as_list[2] url = doc_as_list[3] retweet_text = doc_as_list[4] retweet_url = doc_as_list[5] quote_text = doc_as_list[6] quote_url = doc_as_list[7] #if str(full_text).startswith("RT"): #if the tweet is RT and not hold more text (just share) pass # return False term_dict = {} url = self.parse_url(url) tokenized_text = self.tokenized_parse(full_text) + url doc_length = len(tokenized_text) # after text operations. 
unique_words = set() for i in range(doc_length): if len(tokenized_text[i]) <= 1: continue unique_words.add(tokenized_text[i]) term_dict = self.update_doc_dict(term_dict, tokenized_text[i].lower()) document = Document(tweet_id, tweet_date, full_text, url, retweet_text, retweet_url, quote_text, quote_url, term_dict, doc_length) return document def update_entity_global_dict(self): tmp = sorted(self.entities.items(), key=lambda x: x[1], reverse=True) entity = [] for i in tmp: if tmp[i][1] < 2: entity = tmp[:i] for word in entity: if word[0] not in self.global_dict: self.global_dict[word[0]] = word[1] else: self.global_dict[word[0]] += word[1] self.entities.pop(word[0]) def update_entity_dict(self, term): if term in self.tmp_for_entites.keys(): self.tmp_for_entites[term] += 1 else: self.tmp_for_entites[term] = 1 def extand_contractions(self, word): ''' function extand contraction and Common Acronyms in Twitter :param word: :return: ''' contractions = { "ain't": "am not / are not", "aren't": "are not / am not", "can't": "cannot", "can't've": "cannot have", "'cause": "because", "could've": "could have", "couldn't": "could not", "couldn't've": "could not have", "didn't": "did not", "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hadn't've": "had not have", "hasn't": "has not", "haven't": "have not", "he'd": "he had / he would", "he'd've": "he would have", "he'll": "he shall / he will", "he'll've": "he shall have / he will have", "he's": "he has / he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how has / how is", "i'd": "I had / I would", "i'd've": "I would have", "i'll": "I shall / I will", "i'll've": "I shall have / I will have", "i'm": "I am", "i've": "I have", "isn't": "is not", "it'd": "it had / it would", "it'd've": "it would have", "it'll": "it shall / it will", "it'll've": "it shall have / it will have", "it's": "it has / it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have", "mightn't": "might not", "mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have", "o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she had / she would", "she'd've": "she would have", "she'll": "she shall / she will", "she'll've": "she shall have / she will have", "she's": "she has / she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have", "so's": "so as / so is", "that'd": "that would / that had", "that'd've": "that would have", "that's": "that has / that is", "there'd": "there had / there would", "there'd've": "there would have", "there's": "there has / there is", "they'd": "they had / they would", "they'd've": "they would have", "they'll": "they shall / they will", "they'll've": "they shall have / they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we had / we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what shall / what will", "what'll've": "what shall have / what will have", "what're": "what are", "what's": "what has / what is", "what've": "what have", "when's": "when has / when is", "when've": "when have", "where'd": "where did", "where's": "where has / where is", 
"where've": "where have", "who'll": "who shall / who will", "who'll've": "who shall have / who will have", "who's": "who has / who is", "who've": "who have", "why's": "why has / why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would", "y'all'd've": "you all would have", "y'all're": "you all are", "y'all've": "you all have", "you'd": "you had / you would", "you'd've": "you would have", "you'll": "you shall / you will", "you'll've": "you shall have / you will have", "you're": "you are", "you've": "you have", "AFK": "Away From Keyboard", "BBIAB": "Be Back In A Bit", "BBL": "Be Back Later", "BBS ": "Be Back Soon", "BEG": "Big Evil Grin", "BRB": "Be Right Back", "BTW": "By The Way", "EG": "Evil Grin", "FISH": "First In, Still Here", "IDK": "I Don't Know", "IMO": "In My Opinion", "IRL": "In Real Life", "KISS": "Keep It Simple,Stupid", "LMK": "Let Me Know", "LOL": "Laughing Out Loud", "NYOB": " None of Your Business", "OFC ": "Of Course", "OMG ": "Oh My God", "PANS": "Pretty Awesome New Stuff", "PHAT": "Pretty, Hot, And Tempting", "POS ": "Parents Over Shoulder", "ROFL": "Rolling On the Floor Laughing", "SMH ": "Shaking My Head", "TTYL": "Talk To You Later", "YOLO": "You Only Live Once", "WTH ": "What The Heck", } if (word in contractions): return contractions[word] return word def deEmojify(self, text): "remove the emojipy" emoji_pattern = re.compile( "[" u"\U0001F600-\U0001F64F" # emoticons u"\U0001F300-\U0001F5FF" # symbols & pictographs u"\U0001F680-\U0001F6FF" # transport & map symbols u"\U0001F1E0-\U0001F1FF" # flags (iOS) u"\U00002500-\U00002BEF" # chinese char u"\U00002702-\U000027B0" u"\U00002702-\U000027B0" u"\U000024C2-\U0001F251" u"\U0001f926-\U0001f937" u"\U00010000-\U0010ffff" u"\u2640-\u2642" u"\u2600-\u2B55" u"\u200d" u"\u23cf" u"\u23e9" u"\u231a" u"\ufe0f" # dingbats u"\u3030" "]+", flags=re.UNICODE) return emoji_pattern.sub(r'', text) def parse_url(self, url_string): """ This function takes a url_string from document and break it into to list of word : https://www.instagram.com/p/CD7fAPWs3WM/?igshid=o9kf0ugp1l8x ->[https, www, instagram.com, p, CD7fAPWs3WM, igshid, o9kf0ugp1l8x ] :param tag: Hashtag word from tweet. :return: list include spread world from the url . 
""" if str(url_string).__contains__('t.co') or str( url_string).__contains__('twitter') or len(url_string) < 3: return [] tmp_word = "" word_list = [url_string] url = url_string.replace("//", "/") for i in range(len(url)): if (url[i] == "/" or url[i] == "-" or url[i] == "_"): word_list.append(tmp_word) tmp_word = "" elif i != len(url) - 1: tmp_word = tmp_word + url[i] else: word_list.append(tmp_word) if len(word_list) > 2: word_list = word_list[2:] return word_list def truncate(self, number, digits) -> float: stepper = 10.0**digits return math.trunc(stepper * number) / stepper def fix_number(self, toc_text): """ convert 3000 ->3K 3,000,000->3m :param toc_text: get the tokenizerd text :return: """ for i in range(len(toc_text)): num = toc_text[i] num = num.replace(',', '') if (num.isnumeric()): flag = False for digit in range(len(num)): if (num[digit].isdigit() == False and num[digit] != '.'): flag = True if (flag): continue try: num = float(num) except: continue flag1 = False if (1000 <= num < 1000000): flag1 = True num = num / 1000 num = str(self.truncate(num, 3)) + "K" elif (1000000 <= num < 1000000000): flag1 = True num = num / 1000000 num = str(self.truncate(num, 3)) + "M" elif (num > 1000000000): flag1 = True num = num / 1000000000 num = str(self.truncate(num, 3)) + "B" num = str(num) if (flag1 == False): if (num[-1] == "0"): num = num[0:-1] if (num[-1] == "."): num = num[0:-1] if (flag): if (num[-2] == "0"): num = num[0:-2] + num[-1:] if (num[-1] == "."): num = num[0:-2] + num[-1:] toc_text[i] = num if (i + 1 == len(toc_text)): break else: if (toc_text[i + 1] == "Thousand" or toc_text[i + 1] == "thousand"): toc_text[i] = str(toc_text[i]) + "K" toc_text[i + 1] = "" elif (toc_text[i + 1] == "Million" or toc_text[i + 1] == "million"): toc_text[i] = str(toc_text[i]) + "M" toc_text[i + 1] = "" elif (toc_text[i + 1] == "Billion" or toc_text[i + 1] == "billion"): toc_text[i] = str(toc_text[i]) + "B" toc_text[i + 1] = "" return toc_text def update_doc_dict(self, term_dict, word): #try: if word not in term_dict: term_dict[word] = 1 else: #except: term_dict[word] += 1 return term_dict def update_global_dict(self, word): """ cheack if word in the dict if not save :param word: :return: """ if word not in self.global_dict: self.global_dict[word] = 1 else: self.global_dict[word] += 1 def Hashtags_parse(self, toc_text): """ This function takes a Hashtag world from document and break it into to list of word :param tag: Hashtag word from tweet. :return: list include spread world and #tag . """ copy_toc_text = [] for term in toc_text: copy_toc_text.append(term) count = 0 parseList = '' i = 0 for term in toc_text: count += 1 tag = term flag = True if (len(tag) <= 0 or tag[0] != '#'): continue parseList = tag[1:] parseList = str.replace(parseList, '_', '') #parseList = re.sub(r"([A-Z])", r" \1", parseList) #parseList=self.sub_by_upper(parseList) #secparseList = parseList.replace(' ', '') split_tag = self.sub_by_upper(parseList) + [ '#' + parseList.lower() ] if ('' in split_tag): split_tag.remove('') count -= 1 i = count + i for word in split_tag: copy_toc_text.insert(i, word) i += 1 if (i - count == len(split_tag)): copy_toc_text.remove(term) i = i - count # term_dict = self.update_doc_dict(term_dict, word) # if (flag): # flag = False # self.upper_lower_global_dict(word) return copy_toc_text def percent_parse(self, toc_text): """ This function change the representation of Number%,Number percent,Number percentage to Number% :param s: word from tweet. :return:string in Format Number% . 
""" percent_op = [' percentage', ' PERCENTAGE', ' PERCENT', ' percent'] for i in range(0, len(toc_text)): if (str.isnumeric(toc_text[i]) and i + 1 < len(toc_text) and toc_text[i + 1] in percent_op): toc_text[i] = toc_text[i] + '%' toc_text[i + 1] = "" #term_dict = self.update_doc_dict(term_dict, toc_text[i] + '%') #self.upper_lower_global_dict(toc_text[i] + '%') return toc_text def currency_parse(self, term): """ This function converting string currency to multiple ways to show it :param sentence: thw sentece we look up for currency show :return:same sentence with extends, $-->$,usd,us dollar . """ t = term.upper() currency_dict = { 'ALL': 'Albania Lek', 'AFN': 'Afghanistan Afghani', 'ARS': 'Argentina Peso', 'AWG': 'Aruba Guilder', 'AUD': 'Australia Dollar', 'AZN': 'Azerbaijan New Manat', 'BSD': 'Bahamas Dollar', 'BBD': 'Barbados Dollar', 'BDT': 'Bangladeshi taka', 'BYR': 'Belarus Ruble', 'BZD': 'Belize Dollar', 'BMD': 'Bermuda Dollar', 'BOB': 'Bolivia Boliviano', 'BAM': 'Bosnia and Herzegovina Convertible Marka', 'BWP': 'Botswana Pula', 'BGN': 'Bulgaria Lev', 'BRL': 'Brazil Real', 'BND': 'Brunei Darussalam Dollar', 'KHR': 'Cambodia Riel', 'CAD': 'Canada Dollar', 'KYD': 'Cayman Islands Dollar', 'CLP': 'Chile Peso', 'CNY': 'China Yuan Renminbi', 'COP': 'Colombia Peso', 'CRC': 'Costa Rica Colon', 'HRK': 'Croatia Kuna', 'CU': 'Cuba Peso', 'CZK': 'Czech Republic Koruna', 'DKK': 'Denmark Krone', 'DOP': 'Dominican Republic Peso', 'XCD': 'East Caribbean Dollar', 'EGP': 'Egypt Pound', 'SVC': 'El Salvador Colon', 'EEK': 'Estonia Kroon', 'EUR': 'Euro Member Countries', 'FKP': 'Falkland Islands (Malvinas) Pound', 'FJD': 'Fiji Dollar', 'GHC': 'Ghana Cedis', 'GIP': 'Gibraltar Pound', 'GTQ': 'Guatemala Quetzal', 'GGP': 'Guernsey Pound', 'GYD': 'Guyana Dollar', 'HNL': 'Honduras Lempira', 'HKD': 'Hong Kong Dollar', 'HUF': 'Hungary Forint', 'ISK': 'Iceland Krona', 'INR': 'India Rupee', 'IDR': 'Indonesia Rupiah', 'IRR': 'Iran Rial', 'IMP': 'Isle of Man Pound', 'ILS': 'Israel Shekel', 'JMD': 'Jamaica Dollar', 'JPY': 'Japan Yen', 'JEP': 'Jersey Pound', 'KZT': 'Kazakhstan Tenge', 'KPW': 'Korea (North) Won', 'KRW': 'Korea (South) Won', 'KGS': 'Kyrgyzstan Som', 'LAK': 'Laos Kip', 'LVL': 'Latvia Lat', 'LBP': 'Lebanon Pound', 'LRD': 'Liberia Dollar', 'LTL': 'Lithuania Litas', 'MKD': 'Macedonia Denar', 'MYR': 'Malaysia Ringgit', 'MUR': 'Mauritius Rupee', 'MXN': 'Mexico Peso', 'MNT': 'Mongolia Tughrik', 'MZN': 'Mozambique Metical', 'NAD': 'Namibia Dollar', 'NPR': 'Nepal Rupee', 'ANG': 'Netherlands Antilles Guilder', 'NZD': 'New Zealand Dollar', 'NIO': 'Nicaragua Cordoba', 'NGN': 'Nigeria Naira', 'NOK': 'Norway Krone', 'OMR': 'Oman Rial', 'PKR': 'Pakistan Rupee', 'PAB': 'Panama Balboa', 'PYG': 'Paraguay Guarani', 'PEN': 'Peru Nuevo Sol', 'PHP': 'Philippines Peso', 'PLN': 'Poland Zloty', 'QAR': 'Qatar Riyal', 'RON': 'Romania New Leu', 'RUB': 'Russia Ruble', 'SHP': 'Saint Helena Pound', 'SAR': 'Saudi Arabia Riyal', 'RSD': 'Serbia Dinar', 'SCR': 'Seychelles Rupee', 'SGD': 'Singapore Dollar', 'SBD': 'Solomon Islands Dollar', 'SOS': 'Somalia Shilling', 'ZAR': 'South Africa Rand', 'LKR': 'Sri Lanka Rupee', 'SEK': 'Sweden Krona', 'CHF': 'Switzerland Franc', 'SRD': 'Suriname Dollar', 'SYP': 'Syria Pound', 'TWD': 'Taiwan New Dollar', 'THB': 'Thailand Baht', 'TTD': 'Trinidad and Tobago Dollar', 'TRY': 'Turkey Lira', 'TRL': 'Turkey Lira', 'TVD': 'Tuvalu Dollar', 'UAH': 'Ukraine Hryvna', 'GBP': 'United Kingdom Pound', 'USD': 'United States Dollar', 'UYU': 'Uruguay Peso', 'UZS': 'Uzbekistan Som', 'VEF': 'Venezuela Bolivar', 
'VND': 'Viet Nam Dong', 'YER': 'Yemen Rial', 'ZWD': 'Zimbabwe Dollar' } if t in currency_dict: return currency_dict[t] return term def update_post_dict(self, tweet_id, local_dict, term_dict, tweet_date): """ update the post dict :param tweet_id: tweet ID int :param local_dict: dict hold the loction :param term_dict: dict hold frequency :param tweet_date: :return: """ max_tf = max(term_dict.values()) for term in term_dict: tf = term_dict[term] / max(term_dict.values()) if term not in self.post_dict: self.post_dict[term] = [ [ tweet_id, term_dict[term], tf, local_dict[term][1], len(term_dict), max_tf, tweet_date ] ] #[ tweetID,trem preq,tf,term location,num uniqe terms in tweet,max_tf,date] else: self.post_dict[term].append([ tweet_id, term_dict[term], tf, local_dict[term][1], len(term_dict), max_tf, tweet_date ]) def get_global_dict(self): dict = self.global_dict self.global_dict = {} return dict def get_posting_dict(self): dict = self.post_dict self.post_dict = {} return dict def sub_by_upper(self, text): """ cut long word to lst that the first word start with upper :param text:long word :return: lst that the first word start with uppe """ parseList = [] tmp = [] word = "" for i in range(len(text)): if text[i].isupper(): tmp.append(i) for i in range(len(tmp) - 1): word = text[tmp[i]:tmp[i + 1]] parseList.append(word.lower()) if (len(tmp) > 0): text = text[tmp[-1]:] parseList.append(text.lower()) return parseList def update_entity_dict(self, term): """ update num of show of the entity :param term: :return: """ if term in self.tmp_for_entites.keys(): self.tmp_for_entites[term] += 1 else: self.tmp_for_entites[term] = 1 def find_entities(self, tokenized_text): """ if the function recognize up 2 word start with upper :param tokenized_text: list after tokenized :return: """ UPPER_letter = False tmp_entity = "" for idx, word in enumerate(tokenized_text): if len(word) < 1: continue elif len(tmp_entity.split()) >= 2: self.update_entity_dict(tmp_entity) tmp_entity = "" UPPER_letter = False elif word[0].isupper() and UPPER_letter == True: tmp_entity += " " + word if (idx == len(tokenized_text) - 1): self.update_entity_dict(tmp_entity) elif word[0].isupper() and UPPER_letter == False: UPPER_letter = True tmp_entity += word else: tmp_entity = "" def tokenized_parse(self, full_text): """ :param full_text: the original text :return: list of term without stop words+@term+ #terms without emojify """ full_text = self.deEmojify(full_text) tokenized_text = full_text.split(' ') tokenized_text_copy = [] for term in tokenized_text: if term.lower() in self.trump: tokenized_text_copy.append("trump") tokenized_text[tokenized_text.index(term)] = "trump" continue tokenized_text_copy.append(term) for i in tokenized_text: if i.lower( ) in self.stop_words or i in self.path_stop_words or i.startswith( "\n") or i.startswith( "https") or len(i) < 2: #remove from original tokenized_text_copy.remove(i) continue idx = tokenized_text_copy.index(i) if '.' 
in i: tokenized_text_copy[idx] = tokenized_text_copy[idx].replace( ".", '') if ',' in i: tokenized_text_copy[idx] = tokenized_text_copy[idx].replace( ",", '') tokenized_text_copy[idx] = self.extand_contractions( tokenized_text_copy[idx].lower()) tokenized_text_copy[idx] = self.currency_parse( tokenized_text_copy[idx]) tokenized_text = tokenized_text_copy # save #tag tokenized_text = self.Hashtags_parse(tokenized_text) # save numbers end with M K B tokenized_text = self.fix_number(tokenized_text) # save num% tokenized_text = self.percent_parse(tokenized_text) # save entity self.find_entities(tokenized_text) try: if self.stemmer != None: for i in range(len(tokenized_text)): tokenized_text[i] = self.stemmer.stem_term( tokenized_text[i]) except: pass return tokenized_text def get_entity_dict(self): dict = self.entities self.entities = {} return dict
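# --- Illustrative sketch (not part of the original code) ---
# An approximation of the hashtag expansion performed by Hashtags_parse/sub_by_upper
# above, using a regex instead of the index-based loop. expand_hashtag is a
# hypothetical name; the original implementation may differ on edge cases.
import re

def expand_hashtag(tag):
    body = tag.lstrip('#').replace('_', '')
    words = [w.lower() for w in re.findall(r'[A-Z][a-z]*|[a-z]+|\d+', body)]
    return words + ['#' + body.lower()]

print(expand_hashtag("#StayAtHome"))  # ['stay', 'at', 'home', '#stayathome']
# --- end of illustrative sketch ---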
class Parse: def __init__(self): self.stop_words = stopwords.words('english') self.dictionary_term_index = {} self.array_names_and_entities = {} self.porter_stemmer = Stemmer() def parse_sentence(self, text, stemmer=False): """ This function tokenize, remove stop words and apply lower case for every word within the text :param text: :return: """ list_percent = ["percent", "Percent", "Percentage", "percentage"] self.array_names_and_entities = {} self.dictionary_index = {} text = text.replace("\n", ". ") text = self.ignore_emojis(text) array_text_space = text.split(" ") array_text_space = self.separate_words_with_dots(array_text_space) string_ans = "" array_size = range(len(array_text_space)) string_ans_index = 0 entities_url = [] # help us to replace the url to "" because in get_entities it returns parts of the url for word, idx in zip(array_text_space, array_size): ans = "" if word == '' or word == ' ': continue check_digit = self.isdigit(word) if len(word) < 2 and check_digit is False: continue if len(word) < 2 or self.is_ascii(word) is False: if check_digit is False: word = self.remove_panctuation(word) if self.is_ascii(word) is False or word == '' or word == " " or len( word) < 2 or word.lower() not in self.stop_words: continue if ans == "" and self.is_url(word): entities_url.append(word) if "t.co" in word: continue ans = self.parse_url(word) if ans == "": entities_url.remove(word) continue else: if ans == "" and len(word) < 2 and word[0] != '#' and self.is_ascii(word) and not self.isfloat(word): word = self.remove_panctuation(word) if ans == "" and word[0] == '#': temp_word = self.remove_panctuation(word) if temp_word == "" or temp_word == "#": continue ans = self.parse_hashtag(temp_word) elif ans == "" and word[0] == '@': ans = self.remove_panctuation(word) elif ans == "" and word in list_percent: if idx > 0 and self.isfloat(array_text_space[idx - 1]): ans = self.parse_percentage(array_text_space[idx - 1] + " " + word) string_ans = string_ans[:len(string_ans) - 1 - len(ans)] + string_ans[ len(string_ans) + len(word):] + " " else: ans = word elif ans == "" and (word.lstrip('-').isdigit() or self.isfloat(word.lstrip('-')) or self.isFraction( word.lstrip('-')) or word.replace('~', '').isdigit()): ans = self.convert_str_to_number(array_text_space, idx) if ans == "": pre_ans = self.remove_panctuation(word) if len(pre_ans) < 2: continue array_ans = pre_ans.split() continued_array = [] for word_array_ans in array_ans: splitted_word, is_number = self.split_word_to_numbers_strings(word_array_ans) if splitted_word == '': continue arr = splitted_word.split(" ") for spl in arr: if spl.lower() in self.stop_words or len(word_array_ans) < 2: continue if is_number or self.check_two_letters(spl): spl = self.remove_panctuation_special(spl) string_ans += self.add_to_dictionary(spl, string_ans_index) string_ans_index += len(word) + 1 continue else: string_ans += self.add_to_dictionary(word_array_ans.lower(), string_ans_index) string_ans_index += len(word) + 1 else: string_ans += self.add_to_dictionary(ans, string_ans_index) string_ans_index += len(word) + 1 self.get_name_and_entities(entities_url, array_text_space) array_parsed = string_ans.split() ans = [] for word in array_parsed: if word[0] != '#' and word[0] != '@': if self.check_two_letters(word): us_word = self.remove_panctuation_special(word) ans.append(us_word) continue ans.append(word) return ans, self.array_names_and_entities def separate_words_with_dots(self, array_text): new_text = "" length = range(len(array_text)) for i in length: word = 
array_text[i] if '.' not in word: if word == '': continue new_text += word + " " continue if "http" in word or "www" in word or "t.co" in word or self.isfloat(word): check_regular_point = word.split('.', 1) if check_regular_point[0] != '' and check_regular_point[1] != '' and self.is_url( check_regular_point[1]): new_text += check_regular_point[0] + '. ' + check_regular_point[1] continue if check_regular_point[1] == '': new_text += check_regular_point[0] + " " continue new_text += word + " " continue if self.check_two_letters(word): us_word = self.remove_panctuation_special(word) new_text += us_word + " " continue separate = str(word).split('.') new_text += separate[0] + ". " + separate[1] + " " return new_text.lstrip().split(" ") def is_url(self, text): ''' check if string is a url path :param text: url :return: boolean ''' regex = re.compile( r'^(?:http|ftp)s?://|(?:www)?.' # http:// or https:// r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|' # domain... r'localhost|' # localhost... r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or ip r'(?::\d+)?' # optional port r'(?:/?|[/?]\S+)$', re.IGNORECASE) return re.match(regex, text) is not None def add_to_dictionary(self, text, index): array_of_words = text.split(" ") ans = "" for word in array_of_words: ans += word + " " self.dictionary_index[word] = index if ans == "": return "" return ans def parse_hashtag(self, phrase): """" parser hash tag and lower the letters return array of string #stayAtHome -> ['#stayathome',stay,at,home] """ original_phrase = phrase pattern = re.compile(r"[A-Z][a-z]+|\d+|[A-Z]+(?![a-z])") if phrase[1].islower() and '_' not in original_phrase: phrase = phrase[:1] + phrase[1].upper() + phrase[2:] temp = pattern.findall(phrase) all_words = phrase[1:].split("_") for word in all_words: if word != phrase[1:] and word.lower() and word not in temp: temp.append(word) temp = [str_to_lower.lower() for str_to_lower in temp] # temp.insert(0, original_phrase[0:len(original_phrase)].lower().replace('_', '')) i = 0 len_temp = len(temp) while i < len_temp: if temp[i] in self.stop_words or len(temp[i]) < 2: temp[i] = '' i += 1 return " ".join(temp).lstrip().rstrip() def parse_url(self, string): """ parsing url path return an array of the components """ if string is not None: ans = string.split("/") ans_len = len(ans) remove_www = "" if ans_len > 0: for term in ans: remove_www += term.replace("www.", "") + " " ans[0] = ans[0].replace(ans[0], remove_www) string_without_stopword = "" length = range(len(ans)) ans_string = ans[0].split(" ") for word, idx in zip(ans_string, length): if word == '' or word == ' ': continue if len(word) < 2 or (len(word) > 0 and word[0] == '#'): continue if word not in self.stop_words or word.isnumeric(): if not self.is_url(word): word = self.remove_panctuation(word) string_without_stopword += word + " " return string_without_stopword.lstrip() else: return "" def isdigit(self, word): if "0" <= word <= "9": return True return False def isfloat(self, value): """ check if value is a float number :return: boolean """ try: float(value) return True except ValueError: return False def isFraction(self, token): """ check if value is a fraction number :return: boolean """ if '/' not in token: return False values = token.split('/') return all(i.isdigit() for i in values) def convert_str_to_number_kmb(self, word): """ check if value is a float number, and return the wanted number. 
etc: 1000->1K, 1013456->1.013M :return: boolean """ tmb = '' if word >= 1000000000 or word <= -1000000000: word = float(word / 1000000000) tmb = 'B' elif word >= 1000000 or word <= -1000000: word = float(word / 1000000) tmb = 'M' elif word >= 1000 or word <= -1000: word = float(word / 1000) tmb = 'K' ans = '{:0.3f}'.format(word) return '{0:g}'.format(float(ans)) + tmb def check_two_letters(self, word): if 0 < len(word) < 7 and (word.upper()[0] == 'U' and 'S' in word.upper()): start = word.upper().find('U') + 1 end = word.upper().find('S', start) dot = word[start:end] if dot == '.': return True def split_word_to_numbers_strings(self, word): try: if self.check_two_letters(word): us_word = self.remove_panctuation_special(word) return us_word, False res = re.findall(r'[A-Za-z]+|\d+', word) if len(res)==0: return '', False if len(word) > 0 and self.isfloat(res[0]): if len(res) > 1 and ( "thousand" in res[1].lower() or "million" in res[1].lower() or "billion" in res[1].lower() or "b" in res[1].lower() or "m" in res[1].lower() or "k" in res[1].lower()): if "thousand" in word.lower(): return word.replace(res[1], "K"), True if 'k' in word.lower(): return word.replace(res[1], "K"),True if "million" in word.lower(): return word.replace(res[1], "M"),True if 'm' in word.lower(): return word.replace(res[1], "M"),True if "billion" in word.lower(): return word.replace(res[1], "B"),True if 'b' in word.lower(): return word.replace(res[1], "B"),True else: return (" ".join(res).lstrip().rstrip(),True) else: is_number =False return (" ".join(res).lstrip().rstrip(),False) except: return word, False def convert_str_to_number(self, text_demo, idx): """ check every type of number and return it as a string. etc: 1K,1M,1B,-900,23/5,2020,2K :return: boolean """ help_minus = '' text_return = [] my_word = text_demo[idx] text_demo_length = len(text_demo) my_word = my_word.replace(",", "") if re.search('-', my_word): help_minus = '-' my_word = my_word.replace("-", "") if not self.isfloat(my_word): my_word = self.remove_panctuation(my_word) if self.isFraction(my_word): if idx + 1 == text_demo_length: return ''.join(help_minus + my_word) text_return = ''.join(help_minus + my_word) token_next = text_demo[idx + 1].lower() if token_next == "billion" or token_next == "billions": text_return += 'B' text_demo[idx + 1] = "" if token_next == "million" or token_next == "millions": text_return += 'M' text_demo[idx + 1] = "" if text_demo[idx + 1] == "thousand" or token_next == "thousands": text_return += 'K' text_demo[idx + 1] = "" return help_minus + ''.join(text_return) if my_word != '' and not math.isnan(float(my_word)): number = float(my_word) number_numerize = self.convert_str_to_number_kmb(number) if idx + 1 < len(text_demo): token_next = text_demo[idx + 1].lower() number_to_input = str(number_numerize) if token_next == "billion" or token_next == "billions": if 'K' in number_numerize or 'M' in number_numerize: number_to_input = (number_to_input.translate({ord('K'): None})) number_to_input = (number_to_input.translate({ord('M'): None})) text_return.append(my_word) else: text_return.append(str(number_numerize + 'B')) text_demo[idx + 1] = "" elif token_next == "million" or token_next == "millions": if 'K' in number_numerize: number_to_input = (number_to_input.translate({ord('K'): None})) text_return.append(number_to_input + 'B') else: number_to_input = str(number_numerize) text_return.append(number_to_input + 'M') text_demo[idx + 1] = "" elif token_next == "thousand" or token_next == "thousands": if 'K' in number_numerize: 
number_to_input = (number_to_input.translate({ord('K'): None})) text_return.append(number_to_input + 'M') elif 'M' in number_numerize: number_to_input = (number_to_input.translate({ord('M'): None})) text_return.append(number_to_input + 'B') else: text_return.append(number_to_input + 'K') text_demo[idx + 1] = "" elif 1000 > number > -1000: text_return.append(number_numerize) else: text_return.append(number_numerize) else: text_return.append(number_numerize) if 1900 < number < 2100 and help_minus == '': if '~' in text_demo[idx]: text_return.append(my_word) else: len_number = len(text_demo[idx]) if text_demo[idx][len_number - 1] == '.': res = my_word.replace('.','') text_return.append(res) else: text_return.append(text_demo[idx]) return help_minus + ' '.join(text_return) def ignore_emojis(self, text): emoji_pattern = re.compile("[" u"\U0001F600-\U0001F64F" # emoticons u"\U0001F300-\U0001F5FF" # symbols & pictographs u"\U0001F680-\U0001F6FF" # transport & map symbols u"\U0001F1E0-\U0001F1FF" # flags (iOS) u"\U00002500-\U00002BEF" # chinese char u"\U00002702-\U000027B0" u"\U00002702-\U000027B0" u"\U000024C2-\U0001F251" u"\U0001f926-\U0001f937" u"\U00010000-\U0010ffff" u"\u2640-\u2642" u"\u2600-\u2B55" u"\u200d" u"\u23cf" u"\u23e9" u"\u231a" u"\ufe0f" # dingbats u"\u3030" "]+", flags=re.UNICODE) ans = emoji_pattern.sub(r'', text) return ans def is_ascii(self, s): ans = all(ord(c) < 128 or c == '…' or c == '’' or c == '³' or c == "¹⁹" for c in s) return ans def parse_percentage(self, string): """ change word to percent 100 percent -> 100% :param string: string to check if there is a percent within :return: array of converted strings """ return re.split('\s+', string)[0] + '%' def remove_panctuation_special(self, word): """ remove pancuations from word U.S (like U.S., or U.S.'s) :param word :return: word without panctuation """ if 'a' in word.lower(): temp = word[:5] to_pancuate = word.replace(temp, '') # word = word.lower().replace("u.s", '') word = temp + self.remove_panctuation(to_pancuate) return word else: temp = word[:3] to_pancuate = word.replace(temp, '') # word = word.lower().replace("u.s", '') word = temp + self.remove_panctuation(to_pancuate.lower()) return word def remove_panctuation(self, word): """ remove pancuations from word (like . 
or , or : ) :param word :return: word without panctuation """ if self.check_two_letters(word): #word = self.remove_panctuation_us(word) return word if re.match(r'[^@]+@[^@]+\.[^@]+', word): return word if "#" == word or "##" == word: return "" if word[-2:] == "'s" or word[-2:] == "’s" or word[-2:] == "`s": word = word.replace(word[-2:], "") smiles = [":)", ":(", ":-]", ":-)", ";)", ";-)", ":-(", ";(", ";-(", ":-P", ":P", ":p", ":-p"] for smile in smiles: if smile in word: word = word.replace(smile, "") if word in smiles: return '' if "\n" in word: word = word.replace("\n", " ") if '#' in word and word[0] != '#': word = word.replace("#", "") if '_' in word and '#' not in word: word = word.replace("_", "") if '@' in word and word[0] != '@': word = word.replace("@", "") word = word.replace("-", " ") word = word.replace("'", "") word = re.sub(r'[€£€4️⃣“”‘‼⑥²⁸¹❶❷❽②⑦&$~’.,!…|?,…:;^"{}*=+()⁰\/[\[\]]', '', word) return word def get_name_and_entities(self, entities_url, array_text_space): text = "" for word in array_text_space: if word == '' or word == '' or word[0] == '@' or word[0] == '#' or word == "RT": continue text += word + " " rx2 = re.compile(r'[A-Z][-a-zA-Z]+[1-9]*(?:\s+[A-Z][-a-zA-Z]+[1-9]*)*') matches = rx2.findall(text) tokinzed_entity_new = set() i = 0 for i in range(len(matches)): if len(str(matches[i]).split()) > 1: tokinzed_entity_new.add(str(matches[i])) i += 1 if "COVID 19" in text: tokinzed_entity_new.add("COVID 19") if "Covid 19" in text: tokinzed_entity_new.add("Covid 19") for word in tokinzed_entity_new: if word.lower() not in self.stop_words: all_places = [m.start() for m in re.finditer(word, text)] self.array_names_and_entities[word] = all_places return tokinzed_entity_new def parse_doc(self, doc_as_list, stemmer=False): """ This function takes a tweet document as list and break it into different fields :param doc_as_list: list re-preseting the tweet. :return: Document object with corresponding fields. """ tweet_id = doc_as_list[0] tweet_date = doc_as_list[1] full_text = doc_as_list[2] url = doc_as_list[3] indices = doc_as_list[4] retweet_text = doc_as_list[5] retweet_url = doc_as_list[6] retweet_indices = doc_as_list[7] quote_text = doc_as_list[8] quote_url = doc_as_list[9] quote_indices = doc_as_list[10] term_dict = {} entities_local_dict = {} array_url_parsed = [] url = str(url) rt = False if "RT" in full_text: rt = True tokenized_text, names_and_entities = self.parse_sentence(full_text, stemmer=False) doc_length = len(tokenized_text) # after text operations. if doc_length == 0: return None for term in tokenized_text: if len(term) < 2: continue elif term.isdigit() and len(term) > 3: continue if stemmer: term = self.porter_stemmer.stem_term(term) if term not in term_dict.keys(): term_dict[term] = 1 else: term_dict[term] += 1 for term in names_and_entities.keys(): if len(term) < 2: continue if term in self.stop_words: continue if term not in term_dict.keys(): term_dict[term] = 1 else: term_dict[term] += 1 document = Document(tweet_id, tweet_date, full_text, url, retweet_text, retweet_url, quote_text, quote_url, term_dict, len(self.array_names_and_entities), rt, doc_length) return document
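# --- Illustrative sketch (not part of the original code) ---
# A standalone version of the capitalization-based entity detection in
# get_name_and_entities above: keep only runs of two or more capitalized words.
# find_capitalized_entities is a hypothetical name.
import re

ENTITY_PATTERN = re.compile(r'[A-Z][-a-zA-Z]+(?:\s+[A-Z][-a-zA-Z]+)*')

def find_capitalized_entities(text):
    return {m for m in ENTITY_PATTERN.findall(text) if len(m.split()) > 1}

print(find_capitalized_entities("Donald Trump visited the White House today"))
# {'Donald Trump', 'White House'}  (set order may vary)
# --- end of illustrative sketch ---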
class Parse: __slots__ = [ 'word_dict', 'stemmer', 'stop_words', 'rules', 'spell', 'min_length' ] def __init__(self, config): self.word_dict = {} self.stemmer = Stemmer(config.stemming) self.stop_words = [ self.stemmer.stem_term(word) for word in stopwords.words('english') ] + ['rt', 't.co', 'https'] self.rules = config.parser_rules self.spell = SpellChecker() self.min_length = config.min_length # helper function for numberTostring-->return 3 digit after the point @staticmethod def round_down(n, decimals=0): multiplier = 10**decimals return math.floor(n * multiplier) / multiplier @staticmethod def isNumber(word): return '0' <= word[0] <= '9' def numberToString(self, num): if num < 1000: return str(num) elif 1000 <= num < 1000000: num = num / 1000 num = self.round_down(num, 3) if num == int(num): num = int(num) s = str(num) return s + 'K' elif 1000000 <= num < 1000000000: num = num / 1000000 num = self.round_down(num, 3) if num == int(num): num = int(num) s = str(num) return s + 'M' else: num = num / 1000000000 num = self.round_down(num, 3) if num == int(num): num = int(num) s = str(num) return s + 'B' # This function is "cleaning" the word,removing a ,!@$&*... that appear in start/end of word @staticmethod def strip_punctuations(word): if word == '$': return word start = 0 end = len(word) - 1 while start < len(word) and word[start] in (string.punctuation + '\n\t'): if word[start] == '@' or word[start] == '#' or word[start] == '"': break start += 1 while end >= 0 and word[end] in string.punctuation: if word[end] == '"' or word[end] == '$': break end -= 1 return word[start:end + 1] # This function clean the text-->remove if not exist in ascii table @staticmethod def removeEmojis(text): return text.encode('ascii', 'ignore').decode('ascii') # #stayAtHome--->['#stayAtHome', 'stay', 'At',Home] @staticmethod def hashtag(term): res = [term] start = 1 for i in range(2, len(term)): if term[i].isupper(): res.append(term[start:i]) start = i res.append(term[start:]) return res @staticmethod def URL(text): return [v for v in re.split('[://]|[/?]|[/]|[=]', text) if v] @staticmethod def extendURLs(document): url_map = json.loads(document[3]) url_indices = json.loads(document[4]) full_text = document[2] offset = 0 for index in url_indices: try: new_offset = offset + len(url_map[full_text[(index[0] + offset):(index[1] + offset)]]) - index[1] + \ index[0] full_text = full_text[:(index[0] + offset)] + url_map[ full_text[(index[0] + offset):( index[1] + offset)]] + full_text[(index[1] + offset):] offset = new_offset except: pass document[2] = full_text @staticmethod def add_or_inc(d, term): if not term: return elif term not in d: d[term] = 0 d[term] += 1 def add_to_dict(self, word): low_case = word.lower() if low_case in self.stop_words: return None if len(low_case) < self.min_length: return None if self.rules['capitals']: if low_case in self.word_dict.keys(): if word == low_case: self.word_dict[low_case].text = low_case else: self.word_dict[low_case] = Term(word) else: if low_case not in self.word_dict.keys(): self.word_dict[low_case] = Term(low_case) return self.word_dict[low_case] def add_entity_to_dict(self, word): low_case = word.lower() if low_case in self.stop_words: return None if low_case in self.word_dict.keys(): self.word_dict[low_case].numOfInterfaces += 1 if word == low_case: self.word_dict[low_case].text = low_case else: self.word_dict[low_case] = Term(word) self.word_dict[low_case].is_entity = True return self.word_dict[low_case] def Tokenize(self, text): output = {} if self.rules['spellcheck']: 
word_list = [ self.spell.correction(word) for word in [ self.stemmer.stem_term(self.strip_punctuations(word)) for word in text.split() ] if word ] else: word_list = [ word for word in [ self.stemmer.stem_term(self.strip_punctuations(word)) for word in text.split() ] if word ] size = len(word_list) # find all the quotes in this doc # re.findall() find all quotes and return a list of quotes without " " if self.rules['quotes']: quotes = [ self.add_to_dict('"{}"'.format(quote)) for quote in re.findall(r'"(.*?)"', text) ] for q in quotes: self.add_or_inc(output, q) # The main loop for i in range(size): word = word_list[i] if self.rules['entity']: if (i + 1) < size and 'A' <= word[ 0] <= 'Z' and 'A' <= word_list[i + 1][0] <= 'Z': j = i + 2 entity = word + ' ' + word_list[i + 1] self.add_or_inc(output, self.add_entity_to_dict(entity)) while j < size and 'A' <= word_list[j][0] <= 'Z': entity = entity + ' ' + word_list[j] self.add_or_inc(output, self.add_entity_to_dict(entity)) j += 1 if self.rules['less_more']: if (i + 1) < size and word.lower() in ['less', 'more']: new_term = f'{word} {word_list[i + 1]}' if word_list[i + 1].lower() == 'than' and i + 2 < size: new_term += f' {word_list[i + 2]}' self.add_or_inc(output, self.add_to_dict(new_term.lower())) if self.isNumber(word): if self.rules['number']: try: if i + 1 < size and word_list[i + 1].lower() in [ self.stemmer.stem_term('percent'), self.stemmer.stem_term('percentage') ]: i += 1 word += '%' elif i + 1 < size and word_list[i + 1].lower() in [ self.stemmer.stem_term('dollar'), self.stemmer.stem_term('dollars') ]: i += 1 word += '$' # check if the number is actually separate to 2 word: 35 3/5 elif i + 1 < size and self.isNumber( word_list[i + 1]) and '/' in word_list[i + 1]: word += ' ' + word_list[i + 1] # cases of Thousand=K Million=M Billion=B--->the function numberToString do it elif i + 1 < size and word_list[i + 1].lower( ) == self.stemmer.stem_term('thousand'): i += 1 word = self.numberToString(float(word) * 1000) elif i + 1 < size and word_list[i + 1].lower( ) == self.stemmer.stem_term('million'): i += 1 word = self.numberToString(float(word) * 1000000) elif i + 1 < size and word_list[i + 1].lower( ) == self.stemmer.stem_term('billion'): i += 1 word = self.numberToString( float(word) * 1000000000) else: word = self.numberToString(float(word)) except: pass self.add_or_inc(output, self.add_to_dict(word)) # hashtag elif word[0] == '#': if self.rules['hashtag']: for word in self.hashtag(word): self.add_or_inc(output, self.add_to_dict(word)) # URL elif word[0:4] == "http": if self.rules['url']: for word in self.URL(word): self.add_or_inc(output, self.add_to_dict(word)) # Tag elif word[0] == '@': if self.rules['tag']: self.add_or_inc(output, self.add_to_dict(word)) else: self.add_or_inc(output, self.add_to_dict(word)) return output def parse_doc(self, doc_as_list): """ This function takes a tweet document as list and break it into different fields :param doc_as_list: list re-presetting the tweet. :return: Document object with corresponding fields. 
""" tweet_id = doc_as_list[0] tweet_date = doc_as_list[1] full_text = doc_as_list[2] url = doc_as_list[3] retweet_text = doc_as_list[4] retweet_url = doc_as_list[5] quote_text = doc_as_list[6] quote_url = doc_as_list[7] if self.rules['ext_url']: self.extendURLs(doc_as_list) full_text = doc_as_list[2] if self.rules['emoji']: full_text = self.removeEmojis(full_text) full_text = full_text.replace('\n', ' ') term_dict = self.Tokenize(full_text) doc_length = sum(term_dict.values()) max_word = max(term_dict.values()) document = Document(tweet_id, tweet_date, full_text, url, retweet_text, retweet_url, quote_text, quote_url, term_dict, doc_length, max_word) return document
class Parse: # CONSTANTS KBM_SHORTCUTS = { "k": None, "m": None, "b": None, "K": None, "M": None, "B": None } MONTHS_DICT = {"Jul": ("july", "07"), "Aug": ("august", "08")} DAYS_DICT = { "Sat": "saturday", "Sun": "sunday", "Mon": "monday", "Tue": "tuesday", "Wed": "wednsday", "Thu": "thursday", "Fri": "friday" } RIGHT_SLASH_PATTERN = re.compile(r'^-?[0-9]+\\0*[1-9][0-9]*$') LEFT_SLASH_PATTERN = re.compile(r'^-?[0-9]+/0*[1-9][0-9]*$') NON_LATIN_PATTERN = re.compile( pattern= r'[^\x00-\x7F\x80-\xFF\u0100-\u017F\u0180-\u024F\u1E00-\u1EFF\u2019]') HASHTAG_SPLIT_PATTERN = re.compile( r'[a-zA-Z0-9](?:[a-z0-9]+|[A-Z0-9]*(?=[A-Z]|$))') COVID_DICT = { 'covid': None, 'covid-19': None, 'coronavirus': None, 'covid19': None, 'chinavirus': None, '#covid19': None } def __init__(self, stemming): self.stop_words = stopwords.words('english') self.stop_words.extend([ 'rt', '“', r'’', r'n\'t', 'n\'t', '\'s', r'\'s', r'\'ve', r'\'m', '...', r'\'\'', r'\'d', '&', r'\'ll', r'\'re', r' ', r'', r"", r"''", r'""', r'"', r"“", "”", r"’", "‘", r"``", '``', r"'", r"`", r'!', r'?', r',', r':', r';', r'(', r')', r'...', r'[', ']', r'{', '}' "'&'", '.', r'\'d', '-', '--', 'mask', 'pandemic', 'people', 'wear', 'trump', 'masks', 'new', 'virus', 'wearing', 'cases', 'amp', 'us', 'like' ]) # , 'covid', '19', 'covid-19', 'mask', 'coronavirus', 'pandemic', 'people', 'wear', 'trump', 'covid19', 'masks', 'new', 'virus', 'wearing', 'cases', 'amp', '#covid19', 'us', 'like' self.stop_words_dict = dict.fromkeys(self.stop_words) self.text_tokens = None self.stemmer = None if stemming: self.stemmer = Stemmer() def parse_sentence(self, text): """ This function tokenize, remove stop words and apply lower case for every word within the text :param text: :param capital_letter_indexer: dictionary for words with capital letters :param named_entities: dictionary for named entities in doc :return: """ self.text_tokens = word_tokenize(text) tokenized_list = [] entity_chunk = '' empty_chunk = 0 capital_letter_indexer = {} named_entities = set() for idx, token in enumerate(self.text_tokens): if token.lower() in self.stop_words_dict or (len(token) == 1 and ord(token) > 126): continue if token == '@' and len(self.text_tokens) > idx + 1: self.text_tokens[idx + 1] = '' continue c1 = token[0] if (ord(c1) < 48 or 57 < ord(c1) < 65 or 90 < ord(c1) < 97 or 122 < ord(c1)) and c1 != '#': continue if token in self.COVID_DICT: tokenized_list.append('covid') continue if len(token) > 0 and token[0].isupper(): # chunks entities together. entity_chunk += token + " " empty_chunk += 1 else: # add entity to the global counter and to the current words set if entity_chunk != '': named_entities.add(entity_chunk[:-1]) if empty_chunk > 1: tokenized_list.append(entity_chunk[:-1].lower()) entity_chunk = '' empty_chunk = 0 if token == '#': self.handle_hashtags(tokenized_list, idx) elif self.is_fraction(token): self.handle_fraction(tokenized_list, token, idx) elif token in ["%", "percent", "percentage"]: self.handle_percent(tokenized_list, idx) elif token.isnumeric() or "," in token: self.handle_number(tokenized_list, idx, token) elif '-' in token and len(token) > 1: self.handle_dashes(tokenized_list, token) elif token == 'https' and idx + 2 < len(self.text_tokens): # Will enter only if there are no urls in the dictionaries. 
splitted_trl = self.split_url(self.text_tokens[idx + 2]) tokenized_list.extend([x.lower() for x in splitted_trl]) self.text_tokens[idx + 2] = '' elif token[ -1] in self.KBM_SHORTCUTS and self.convert_string_to_float( token[:-1]): tokenized_list.append(token.upper()) else: if self.stemmer is not None: token = self.stemmer.stem_term(token) self.append_to_tokenized(tokenized_list, capital_letter_indexer, token) return tokenized_list, capital_letter_indexer, named_entities def parse_doc(self, doc_as_list): """ This function takes a tweet document as list and break it into different fields :param doc_as_list: list re-preseting the tweet. :return: Document object with corresponding fields. """ if len(doc_as_list) > 0: tweet_id = int(doc_as_list[0]) else: tweet_id = None if len(doc_as_list) > 1: tweet_date = doc_as_list[1] else: tweet_date = None if len(doc_as_list) > 2: full_text = doc_as_list[2] else: full_text = None if len(doc_as_list) > 3: url = self.json_convert_string_to_object(doc_as_list[3]) else: url = None if len(doc_as_list) > 6: retweet_url = self.json_convert_string_to_object(doc_as_list[6]) else: retweet_url = None if len(doc_as_list) > 8: quote_text = doc_as_list[8] else: quote_text = None if len(doc_as_list) > 9: quote_url = self.json_convert_string_to_object(doc_as_list[9]) else: quote_url = None if len(doc_as_list) > 12: retweet_quoted_url = self.json_convert_string_to_object( doc_as_list[12]) else: retweet_quoted_url = None if full_text is None or tweet_id is None or tweet_date is None: return None dict_list = [url, retweet_url, quote_url, retweet_quoted_url] max_tf = 0 # if tweet_id in [1291243586835472384, 1291188776493080576, 1291180630315868162, 1291329776444112902, 1291356400829038592]: # print() urls_set = set() try: # holds all URLs in one place for d in dict_list: if d is not None: for key in d.keys(): if key is not None and d[key] is not None: urls_set.add(d[key]) except: urls_set = set() if quote_text is not None: full_text = full_text + " " + quote_text # removes redundant short URLs from full_text if len(urls_set) > 0: full_text = self.clean_text_from_urls(full_text) # takes off non-latin words. full_text = re.sub(self.NON_LATIN_PATTERN, u'', full_text) if len(full_text) == 0: return None tokenized_text, capital_letter_indexer, named_entities = self.parse_sentence( full_text) if len(tokenized_text) == 0: return None # tokenized_text.extend([x.lower() for x in self.handle_dates(tweet_date)]) # expends the full text with tokenized urls self.expand_tokenized_with_url_set(tokenized_text, urls_set) term_dict = {} doc_length = len(tokenized_text) # after text operations. 
for idx, term in enumerate(tokenized_text): if term not in term_dict.keys(): # holding term's locations at current tweet term_dict[term] = 1 else: term_dict[term] += 1 if term_dict[term] > max_tf: max_tf = term_dict[term] tweet_date = datetime.strptime(tweet_date, '%a %b %d %X %z %Y') document = Document(tweet_id, tweet_date, term_dict, doc_length, max_tf, len(term_dict), capital_letter_indexer, named_entities) return document def handle_hashtags(self, tokenized_list, idx): """ merges text_tokens[idx] with text_tokens[idx+1] such that '#','exampleText' becomes '#exampleText' and inserts 'example' and 'Text' to text_tokens :param tokenized_list: list that the terms will be appended to :param idx: index of # in text_tokens :return: """ if len(self.text_tokens) > idx + 1: splitted_hashtags = self.hashtag_split(self.text_tokens[idx + 1]) # tokenized_list.append((self.text_tokens[idx] + self.text_tokens[idx + 1]).lower()) tokenized_list.extend([ x.lower() for x in splitted_hashtags if x.lower() not in self.stop_words_dict ]) self.text_tokens[idx + 1] = '' def handle_tags(self, tokenized_list, idx): """ merges text_tokens[idx] with text_tokens[idx+1] such that '@','example' becomes '@example' :param tokenized_list: list of tokenized words :param idx: index of @ in text_tokens """ if len(self.text_tokens) > idx + 1: # tokenized_list.append((self.text_tokens[idx] + self.text_tokens[idx + 1]).lower()) # self.text_tokens[idx] = '' self.text_tokens[idx + 1] = '' def hashtag_split(self, tag): """ splits a multi-word hash-tag to a list of its words :param tag: single hash-tag string :return: list of words in tag """ return re.findall(self.HASHTAG_SPLIT_PATTERN, tag) def handle_percent(self, tokenized_list, idx): """ merges text_tokens[idx] with text_tokens[idx-1] such that "%"/"percent"/"percentage",'50' becomes '50%' :param tokenized_list: list of tokenized words :param idx: index of % in text_tokens :return: """ if idx is not 0: dash_idx = self.text_tokens[idx - 1].find('-') if self.is_fraction(self.text_tokens[idx - 1]): number = self.text_tokens[idx - 1] else: number = self.convert_string_to_float(self.text_tokens[idx - 1]) if number is not None: if (self.text_tokens[idx - 1].lower() + "%").lower() not in self.stop_words_dict: tokenized_list.append(self.text_tokens[idx - 1].lower() + "%") elif dash_idx != -1: left = self.text_tokens[idx - 1][:dash_idx] right = self.text_tokens[idx - 1][dash_idx + 1:] if left.isnumeric() and right.isnumeric() and ( self.text_tokens[idx - 1].lower() + "%") not in self.stop_words_dict: tokenized_list.append(self.text_tokens[idx - 1].lower() + "%") def handle_number(self, tokenized_list, idx, token): """ converts all numbers to single format: 2 -> 2 68,800 -> 68.8K 123,456,678 -> 123.456M 3.5 Billion -> 3.5B :param tokenized_list: list of tokenized words :param idx: index of % in text_tokens :param token: text_tokens[idx] :return: """ number = self.convert_string_to_float(token) if number is None: tokenized_list.append(token.lower()) return multiplier = 1 if len(self.text_tokens) > idx + 1: if self.text_tokens[idx + 1] in ["%", "percent", "percentage"]: return if self.text_tokens[idx + 1].lower() in [ "thousand", "million", "billion" ]: if self.text_tokens[idx + 1].lower() == "thousand": multiplier = 1000 elif self.text_tokens[idx + 1].lower() == "million": multiplier = 1000000 elif self.text_tokens[idx + 1].lower() == "billion": multiplier = 1000000000 self.text_tokens[idx + 1] = '' number = number * multiplier kmb = "" if number >= 1000000000: number /= 1000000000 
kmb = 'B' elif number >= 1000000: number /= 1000000 kmb = 'M' elif number >= 1000: number /= 1000 kmb = 'K' # if number is not an integer, separates it to integer and fraction # and keeps at most the first three digits in the fraction if "." in str(number): dot_index = str(number).index(".") integer = str(number)[:dot_index] fraction = str(number)[dot_index:dot_index + 4] if fraction == ".0": number = integer else: number = integer + fraction else: number = str(number) tokenized_list.append(number + kmb) def convert_string_to_float(self, s): """ tries to convert a string to a float if succeeds, returns float if fails, returns None :param s: string to convert :return: float / None """ if "," in s: s = s.replace(",", "") try: number = float(s) return number except: return None def split_url(self, url): """ separates a URL string to its components ex: url = https://www.instagram.com/p/CD7fAPWs3WM/?igshid=o9kf0ugp1l8x output = [https, www.instagram.com, p, CD7fAPWs3WM, igshid, o9kf0ugp1l8x] :param url: url as string :return: list of sub strings """ if url is not None: r = re.split('[/://?=]', url) if 'twitter.com' in r or 't.co' in r: return [] if len(r) > 3 and 'www.' in r[3]: r[3] = r[3][4:] return [ x.lower() for x in r if (x != '' and x != 'https' and not x.startswith('#')) ] def expand_tokenized_with_url_set(self, to_extend, urls_set): """ extends the to_extend list with the parsed values in url_set :param to_extend: list of strings to extend :param urls_set: a Set containing URL strings :return: """ for url in urls_set: to_extend.extend(self.split_url(url)) def take_emoji_off(self, token): return self.emoji_pattern.sub(r'', token) def json_convert_string_to_object(self, s): """ converts a given string to its corresponding object according to json used specifically to dictionaries :param s: string to convert :return: Object / None """ if s is None or s == '{}': return None else: try: return json.loads(s) except: return None def clean_text_from_urls(self, text): """ removes all URLs from text :param text: string :return: string without urls """ text = re.sub(r'http\S+|www.\S+', '', text) return text def handle_dashes(self, tokenized_list, token): """ Adds token's words separated to the tokenized list. e.g: Word-word will be handled as [Word,word, Word-word] :param tokenized_list: list of tokenized words :param token: String to separate :return: None """ dash_idx = token.find('-') after_dash = token[dash_idx + 1:].lower() if dash_idx > 0: tokenized_list.append(token.lower()) before_dash = token[:dash_idx].lower() if before_dash not in self.stop_words_dict: tokenized_list.append(before_dash) if after_dash not in self.stop_words_dict: tokenized_list.append(after_dash) else: if after_dash not in self.stop_words_dict: tokenized_list.append(after_dash) def handle_fraction(self, tokenized_list, token, idx): """ takes care of strings representing fractions if there is a number before the fraction, it concats both tokens into one. 
:param tokenized_list: list of tokenized words :param token: single word that would be handled :param idx: the index of the word in text_tokens :return: """ slash_idx = token.find('\\') if slash_idx != -1: token = token[:slash_idx] + '/' + token[slash_idx + 1:] frac = str(Fraction(token)) if idx == 0 and frac != token and frac.lower( ) not in self.stop_words_dict: tokenized_list.append(frac.lower()) else: number = self.convert_string_to_float(self.text_tokens[idx - 1]) if number is not None: if (self.text_tokens[idx - 1] + " " + token).lower() not in self.stop_words_dict: tokenized_list.append( (self.text_tokens[idx - 1] + " " + token).lower()) self.text_tokens[idx] = '' elif token != frac: if frac.lower() not in self.stop_words_dict: tokenized_list.append(frac.lower()) if token.lower() not in self.stop_words_dict: tokenized_list.append(token.lower()) else: if token.lower() not in self.stop_words_dict: tokenized_list.append(token.lower()) def is_fraction(self, token): """ checks whether given token is a fraction. :param token: string :return: boolean """ return re.match(self.RIGHT_SLASH_PATTERN, token) is not None or \ re.match(self.LEFT_SLASH_PATTERN, token) is not None def handle_dates(self, tweet_date): """ takes tweet's date and parsing it's information into tokenized_list :param tweet_date: date in string :return: list of parsed information """ splitted_date = tweet_date.split() day_num = splitted_date[2] month_txt, month_num = self.MONTHS_DICT[splitted_date[1]] date_num = day_num + "/" + month_num + "/" + splitted_date[5] return [month_txt, date_num, splitted_date[3]] def append_to_tokenized(self, tokenized_list, capital_letters, token): """ appends given token to tokenized list and to capital_letters dictionary according to it's first letter. :param tokenized_list: list of tokenized words :param capital_letters: dictionary containing words and boolean value. :param token: given word. :return: """ if len(token) > 0 and token[0].isupper(): if token not in capital_letters: capital_letters[token.lower()] = True else: capital_letters[token.lower()] = False if token.lower() not in self.stop_words_dict: c1 = token[0] if (ord(c1) < 48 or 57 < ord(c1) < 65 or 90 < ord(c1) < 97 or 122 < ord(c1)) and c1 != '#': return elif len(token) == 1 and 48 <= ord(c1) <= 57: return tokenized_list.append(token.lower())
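# A small usage sketch of the camel-case hashtag splitting implemented by
# handle_hashtags/hashtag_split above; the pattern mirrors HASHTAG_SPLIT_PATTERN
# and is repeated here only so the snippet runs on its own.
import re

HASHTAG_SPLIT = re.compile(r'[a-zA-Z0-9](?:[a-z0-9]+|[A-Z0-9]*(?=[A-Z]|$))')

def split_hashtag(tag):
    """Split the text that follows '#' into its camel-case words."""
    return HASHTAG_SPLIT.findall(tag)

# Example: split_hashtag('StayAtHome') -> ['Stay', 'At', 'Home']
# Example: split_hashtag('COVID19')    -> ['COVID19']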
class Parse: def __init__(self, with_stemmer=False, include_urls=False, include_quote=False, debug=False, timer=False): self.stemmer = Stemmer() self.with_stemmer = with_stemmer self.include_urls = include_urls self.include_quote = include_quote self.stop_words = stopwords.words('english') self.stop_words += ["i'm", "it's", 'they', "i've", 'you', 'u', 'we', 'rt', 'im', 'use', 'sure', ] self.debug = debug self.timer = timer self.times = [] def _is_number(self, number): return number.replace(',', '').replace('.', '', 1).replace('%', '', 1).replace('$', '', 1).replace('K', '', 1) \ .replace('M', '', 1).replace('B', '', 1).isdigit() def _pre_parse(self, text): text = ' '.join([w for w in text.split(' ') if '…' not in w]) whitespace = ' \t\n\r\v\f' ascii_lowercase = 'abcdefghijklmnopqrstuvwxyz' ascii_uppercase = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' digits = '0123456789' # punctuation = r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~""" punctuation = r"""!#$%&'*’+,-./<=>?@[\]^_{|}~""" printable = digits + ascii_lowercase + ascii_uppercase + punctuation + whitespace text = ''.join([x for x in text if x in printable]) text = text.replace('\n', ' ') # remove new lines text = re.sub(' +', ' ', text) # Remove double spaces return text def _extract_entities(self, text): terms = [] entities_terms = [] subterm = '' for subtext in text.split(','): sub_terms = subtext.split(' ') for term in sub_terms: if not term.replace("'", '').replace('-', '').isalnum(): #Not a word if len(subterm.split(' ')) >= 2: entities_terms.append(subterm) subterm = '' elif term[0].upper() == term[0]: if subterm == '': subterm = term.replace('-', ' ') else: subterm += ' ' + term.replace('-', ' ') else: if len(subterm.split(' ')) >= 2: entities_terms.append(subterm) subterm = '' terms.append(term) entities_terms = [term for term in entities_terms if term != ''] return entities_terms, terms def _number_transform(self, term): opt_term = term.replace('%', '', 1).replace('$', '', 1).replace('K', '', 1) \ .replace('M', '', 1).replace('B', '', 1) replaced_term_optional = opt_term.replace(',', '') if not self._is_number(term.replace(',', '')): return term if float(replaced_term_optional) < 1000: number = round(float(replaced_term_optional), 3) if number == float(int(float(replaced_term_optional))): number = int(number) return term.replace(replaced_term_optional, str(number)) elif float(replaced_term_optional) < 1000000: if term.isdigit() and len(term) == 4 and int(term) > 1500 and int(term) < 2100: # Maybe an year return term else: number = round(float(replaced_term_optional) / 1000, 3) if number == float(float(replaced_term_optional) // 1000): number = int(number) return term.replace(opt_term, str(number) + 'K') elif float(replaced_term_optional) < 1000 * 1000 * 1000: number = round(float(replaced_term_optional) / 1000000, 3) if number == float(float(replaced_term_optional) // 1000000): number = int(number) return term.replace(opt_term, str(number) + 'M') elif float(replaced_term_optional) < 1000 * 1000 * 1000 * 1000: number = round(float(replaced_term_optional) / 1000000, 3) if number == float(float(replaced_term_optional) // 1000000): number = int(number) return term.replace(opt_term, str(number) + 'B') else: return term def _url_transform(self, url): parts = [] url_parts = url.split('/') parts.append(url_parts[0][:-1]) addr = url_parts[2] addr_parts = addr.split('.') addr_parts = [addr_parts[0]] + ['.'.join(addr_parts[1:])] if addr_parts[0] == 'www' else ['.'.join(addr_parts)] parts = parts + addr_parts + url_parts[3:-1] info = url_parts[-1].split('?') 
if len(info) == 1: parts = parts + info elif len(info) == 3: assert 1 == 0 else: parts.append(info[0]) props = info[1].split('&') for prop in props: parts = parts + prop.split('=') parts = [p for p in parts if p != ''] return parts def remove_comma(self, w): w = re.sub('[,]*$', '', w) w = re.sub('[.]*$', '', w) w = re.sub('^[,]*', '', w) w = re.sub('^[.]*', '', w) w = re.sub('[:]*$', '', w) w = re.sub('[-]+', ' ', w) w = re.sub('[’]+', "'", w) w = re.sub('[?]*$', '', w) w = re.sub('[!]*$', '', w) return w def _splitHashtags(self, term_): for i in range(len(term_) - 1)[::-1]: if term_[i].isupper() and term_[i + 1].islower(): term_ = term_[:i] + ' ' + term_[i:] if term_[i].isupper() and term_[i - 1].islower(): term_ = term_[:i] + ' ' + term_[i:] return term_.split() def _hashtags_tag_parse(self, tokens): result_tokens = [] rest_tokens = [] for w in tokens: if w[0] == '#': for subw in w[1:].split('_'): splited_hashtag = self._splitHashtags(subw) result_tokens += [sub_hashtag.lower() for sub_hashtag in splited_hashtag] result_tokens.append(w.replace('_', '').lower()) elif w[0] == '@': result_tokens.append(w) else: rest_tokens.append(w) return result_tokens, rest_tokens def _special_parse(self, tokens): parse_number_comma_tokens = [] for w in tokens: n_new_text_tokens = len(parse_number_comma_tokens) - 1 if (w.lower() == 'percent' or w.lower() == 'percentage') and len(parse_number_comma_tokens) != 0 and \ self._is_number(parse_number_comma_tokens[n_new_text_tokens]): parse_number_comma_tokens[n_new_text_tokens] = parse_number_comma_tokens[n_new_text_tokens] + '%' elif (w.lower() == 'dollar' or w.lower() == 'dollars') and len(parse_number_comma_tokens) != 0 and \ self._is_number(parse_number_comma_tokens[n_new_text_tokens]): parse_number_comma_tokens[n_new_text_tokens] = parse_number_comma_tokens[n_new_text_tokens] + '$' elif w.lower() == 'thousand' and len(parse_number_comma_tokens) != 0 and \ self._is_number(parse_number_comma_tokens[n_new_text_tokens]): parse_number_comma_tokens[n_new_text_tokens] = parse_number_comma_tokens[n_new_text_tokens] + 'K' elif (w.lower() == 'million' or w.lower() == 'mill') and len(parse_number_comma_tokens) != 0 and \ self._is_number(parse_number_comma_tokens[n_new_text_tokens]): parse_number_comma_tokens[n_new_text_tokens] = parse_number_comma_tokens[n_new_text_tokens] + 'M' elif w.lower() == 'billion' and len(parse_number_comma_tokens) != 0 and \ self._is_number(parse_number_comma_tokens[n_new_text_tokens]): parse_number_comma_tokens[n_new_text_tokens] = parse_number_comma_tokens[n_new_text_tokens] + 'B' elif len(w.split('/')) == 2 and w.split('/')[0].isdigit() and len(parse_number_comma_tokens) != 0 and \ w.split('/')[1].isdigit() and self._is_number(parse_number_comma_tokens[n_new_text_tokens]): parse_number_comma_tokens[n_new_text_tokens] = parse_number_comma_tokens[n_new_text_tokens] + ' ' + w else: parse_number_comma_tokens.append(w) return parse_number_comma_tokens def _remove_slashes(self, tokens): result_tokens = [] for token in tokens: if len(token.split('/')) == 1: result_tokens.append(token) continue splited = token.split('/') if len(splited) == 2 and splited[0].isdigit() and splited[1].isdigit(): result_tokens.append(token) else: result_tokens += splited return result_tokens def _apply(self, func, input): end_time, start_time = 0, 0 if self.timer: start_time = time.perf_counter() result = func(input) end_time = time.perf_counter() else: result = func(input) if self.debug: print(result) self.times.append(end_time - start_time) return result def 
parse_sentence(self, text): """ This function tokenize, remove stop words and apply lower case for every word within the text :param text: :return: """ self.timer = True self.times = [] if self.debug: print('Text:', text) text = self._apply(self._pre_parse, text) entities, temp_text_tokens = self._apply(self._extract_entities, text) removed_urls_tokens = [w for w in temp_text_tokens if not w.startswith('https')] text_tokens = self._apply(self._remove_slashes, removed_urls_tokens) remove_comma_terms = [self.remove_comma(term) for term in text_tokens if self.remove_comma(term) != ''] entities_terms = [self.remove_comma(term) for term in entities if self.remove_comma(term) != ''] fix_numbers_terms = [self._number_transform(w) for w in remove_comma_terms] parse_number_comma_tokens = self._apply(self._special_parse, fix_numbers_terms) parse_number_comma_tokens = [w for w in parse_number_comma_tokens if w.lower() not in self.stop_words] tokens_parsed, rest_tokens = self._apply(self._hashtags_tag_parse, parse_number_comma_tokens) capital_tokens = [token.upper() for token in rest_tokens if token.lower() != token] rest_tokens = [token for token in rest_tokens if token.lower() == token] if self.with_stemmer: rest_tokens = [self.stemmer.stem_term(token) for token in rest_tokens] total_tokens = rest_tokens + entities_terms + tokens_parsed + capital_tokens if self.debug: print('Total tokens:', total_tokens) return total_tokens def _parse_urls(self, urls): urls = urls.replace('null', 'None') urls_tokens = [self._url_transform(w) for w in eval(urls).values() if w != '' and w is not None and 'twitter.com' not in w] urls_tokens = [item for sublist in urls_tokens for item in sublist] return urls_tokens def parse_doc(self, doc_as_list): """ This function takes a tweet document as list and break it into different fields :param doc_as_list: list re-preseting the tweet. :return: Document object with corresponding fields. """ tweet_id = doc_as_list[0] tweet_date = doc_as_list[1] full_text = doc_as_list[2] url = doc_as_list[3] quote_text = doc_as_list[8] quote_url = doc_as_list[9] term_dict = {} #print(full_text) try: tokenized_text = self.parse_sentence(full_text) except: print(full_text) tokenized_text = [] # print(tokenized_text) # print('---------------------------------------------------------') if self.include_urls: tokenized_text += self._parse_urls(url) if self.include_quote and quote_text is not None: tokenized_text += self.parse_sentence(quote_text) if self.include_quote and self.include_urls and quote_url is not None: tokenized_text += self._parse_urls(quote_url) doc_length = len(tokenized_text) # after text operations. for term in tokenized_text: if term not in term_dict.keys(): term_dict[term] = 1 else: term_dict[term] += 1 document = Document(tweet_id, tweet_date, full_text, url, retweet_text=None, retweet_url=None, quote_text=quote_text, quote_url=quote_url, term_doc_dictionary=term_dict, doc_length=doc_length) return document
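# A compact, hedged sketch of the K/M/B number normalization that several of
# the parsers above implement (e.g. _number_transform): large counts are scaled
# and suffixed. The exact rounding policy (three decimals, trailing '.0'
# dropped) is an assumption modeled on the code above, not a shared spec.
def normalize_number(value):
    for threshold, suffix in ((1_000_000_000, 'B'), (1_000_000, 'M'), (1_000, 'K')):
        if value >= threshold:
            scaled = round(value / threshold, 3)
            if scaled == int(scaled):
                scaled = int(scaled)
            return f"{scaled}{suffix}"
    return str(int(value)) if value == int(value) else str(round(value, 3))

# Example: normalize_number(68800)     -> '68.8K'
# Example: normalize_number(123456678) -> '123.457M'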
class Parse: num_of_docs = 0 total_doc_length = 0 retweet_dict = {} def __init__(self, config=None, advanced=False): # stopwords_to_add = ['rt'] self.english_word = words.words() self.stop_words = stopwords.words('english') puncs_to_add = ['...', '', '\'', '“', '”', '’', '…'] self.punctuators = [punc for punc in string.punctuation] + puncs_to_add self.tt = TweetTokenizer() self.stemmer = Stemmer() self.need_stemming = config.toStem if isinstance( config, ConfigClass) else False self.caps_dict = {} self.rules_dict = {} self.advanced = advanced def parse_sentence(self, text, urls={}): """ This function tokenize, remove stop words and apply lower case for every word within the text :param urls: :param text: :return: """ text_tokens = self.tt.tokenize(text) text_tokens_after_rules = [] # regEx patterns url_pattern = re.compile( r'^http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+' ) hashtag_pattern = re.compile(r'(?:\#+[\w_]+[\w\'_\-]*[\w_]+)') mention_pattern = re.compile(r'(?:@[\w_]+)') numbers_pattern = re.compile(r'(?:(?:\d+,?)+(?:\.?\d+)?)') fractions_pattern = re.compile(r'(-?\d+)/(-?\d+)') emails_pattern = re.compile(r'\w\S*@.*\w') english_word_pattern = re.compile(r"[A-Za-z'-]+") for i, token in enumerate(text_tokens): if token.lower() in self.stop_words + self.punctuators: continue if self.advanced: if "-" in token: # split hyphen text_tokens_after_rules += token.replace("-", " ").split() if "/" in token: # split hyphen text_tokens_after_rules += token.replace("/", " ").split() if token.encode( 'ascii', 'ignore').decode('ascii') == '': # remove emoji continue if emails_pattern.match(token): # remove emails continue maybe_ent = '' if token[0].isupper(): maybe_ent += token text_tokens.remove(token) if len(text_tokens) > i: token = text_tokens[i] while token[0].isupper(): maybe_ent += ' ' + token text_tokens.remove(token) if len(text_tokens) > i: token = text_tokens[i] else: break if maybe_ent[0].isupper(): self.caps_dict[maybe_ent.lower()] = False self.check_capital(maybe_ent) if len(maybe_ent.split()) == 1: text_tokens_after_rules += [maybe_ent.lower()] else: text_tokens_after_rules += [maybe_ent.lower()] + [ tok.lower() for tok in maybe_ent.split() ] if token.lower() in self.stop_words + self.punctuators: continue if hashtag_pattern.match(token): text_tokens_after_rules += self.stemming_rule( self.hashtag_rule(token[1:])) elif url_pattern.match(token): # not use url if token in urls: url = urls[token] if url is not None: text_tokens_after_rules += self.URL_rule(url) continue elif mention_pattern.match(token): text_tokens_after_rules += self.stemming_rule([token]) elif numbers_pattern.match(token): if numbers_pattern.match(token).span() == (0, len(token)): if i + 1 < len(text_tokens): if text_tokens[i + 1].lower() in [ 'percent', 'percentage', '%' ]: per = text_tokens[i + 1] text_tokens_after_rules += [ self.numbers_rule(token)[0] + '%' ] text_tokens.remove(per) elif text_tokens[i + 1] in ['$', '¢', '£', '€']: sign = text_tokens[i + 1] text_tokens_after_rules += [ sign + self.numbers_rule(token)[0] ] text_tokens.remove(sign) elif text_tokens[i + 1].upper() in ['M', 'KM', 'CM', 'MM']: sign = text_tokens[i + 1] text_tokens_after_rules += [ self.numbers_rule(token)[0] + sign.upper() ] text_tokens.remove(sign) elif token.replace('.', '').replace(',', '').isdigit(): zeros_dict = { 'thousand': '0' * 3, 'million': '0' * 6, 'billion': '0' * 9 } multiplier = text_tokens[i + 1] if multiplier.lower() in zeros_dict.keys(): text_tokens_after_rules += self.numbers_rule( token + 
zeros_dict[multiplier.lower()]) text_tokens.remove(multiplier) elif fractions_pattern.match(text_tokens[i + 1]): frac = text_tokens[i + 1] text_tokens_after_rules += [ self.numbers_rule(token)[0] + f' {frac}' ] text_tokens.remove(frac) else: text_tokens_after_rules += self.numbers_rule( token) elif token[-1].upper() in ['K', 'M', 'B']: zeros_dict = { 'K': '0' * 3, 'M': '0' * 6, 'B': '0' * 9 } multiplier = token[-1] text_tokens_after_rules += self.numbers_rule( token[:-1] + zeros_dict[multiplier.upper()]) elif token[-2:].upper() in ['BN']: zeros_dict = {'BN': '0' * 9} multiplier = token[-2:] text_tokens_after_rules += self.numbers_rule( token[:-2] + zeros_dict[multiplier.upper()]) else: text_tokens_after_rules += self.numbers_rule(token) else: text_tokens_after_rules += self.stemming_rule([token]) else: text_tokens_after_rules += self.stemming_rule([token]) text_tokens_after_rules = [ w for w in text_tokens_after_rules if w not in self.stop_words ] return text_tokens_after_rules def hashtag_rule(self, text): if '_' in text: return text.lower().split('_') + [ '#' + text.lower().replace('_', '') ] else: splitted = re.sub('([A-Z][a-z]+)', r' \1', re.sub('([A-Z]+)', r' \1', text)).split() return [s.lower() for s in splitted] + ['#' + text.lower()] def URL_rule(self, text): splitted = re.split("[, \-!?:=\n/…]+", text) splitted[1:1] = splitted[1].split('.', maxsplit=1) splitted.remove(splitted[3]) url_stopwords = self.stop_words + self.punctuators + [ 'http', 'www', 'https', 'com', 'co', 'twitter', 'status', 'web' ] without_stopwords = [s for s in splitted[:3] if s not in url_stopwords] return without_stopwords def numbers_rule(self, text): number_str = text.split()[0].replace(',', '') if '.' in number_str: number = float(number_str) else: number = int(number_str) if number < 10**3: return ["{:.3f}".format(number).strip('0').strip('.')] elif 10**3 <= number < 10**6: return [ "{:.3f}".format(number / 10**3).strip('0').strip('.') + 'K' ] elif 10**6 <= number < 10**9: return [ "{:.3f}".format(number / 10**6).strip('0').strip('.') + 'M' ] else: return [ "{:.3f}".format(number / 10**9).strip('0').strip('.') + 'B' ] def stemming_rule(self, tokens): if self.need_stemming: after_tokens = [] for token in tokens: after_tokens.append(self.stemmer.stem_term(token)) return after_tokens else: return tokens def parse_doc(self, doc_as_list): """ This function takes a tweet document as list and break it into different fields :param doc_as_list: list re-preseting the tweet. :return: Document object with corresponding fields. """ self.num_of_docs += 1 tweet_id = doc_as_list[0] tweet_date = doc_as_list[1] full_text = doc_as_list[2] url = doc_as_list[3] retweet_text = doc_as_list[4] retweet_url = doc_as_list[6] quote_text = doc_as_list[6] quote_url = doc_as_list[7] term_dict = {} urls = json.loads(url) tokenized_text = self.parse_sentence(full_text, urls) parsed_text = [ tok for tok in tokenized_text if tok not in self.stop_words + self.punctuators ] doc_length = len( parsed_text ) # after text operations. 
        # TODO: check if before parsing gives better results
        self.total_doc_length += doc_length

        if retweet_url is not None:
            # print(retweet_url)
            tid_ptrn = re.compile(r'\d{7,}')
            # for url in retweet_url.values():
            s = tid_ptrn.search(retweet_url)
            if s is not None:
                tid = retweet_url[s.start():s.end()]
                if tid not in self.retweet_dict:
                    self.retweet_dict[tid] = 1
                else:
                    self.retweet_dict[tid] += 1

        for term in parsed_text:
            # capitalized terms are keyed upper-case, the rest lower-case; counting
            # against that key keeps repeated mixed-case terms from being reset
            key = term.upper() if term[:1].isupper() else term.lower()
            term_dict[key] = term_dict.get(key, 0) + 1

        # fold an upper-case entry into its lower-case twin when both exist
        for term in [key for key in term_dict.keys() if key.islower()]:
            if term.upper() in term_dict.keys():
                term_dict[term] += term_dict.pop(term.upper())

        # if self.num_of_docs % self.group_size == 0:
        #     self.write_file()

        document = Document(tweet_id, tweet_date, full_text, url, retweet_text,
                            retweet_url, quote_text, quote_url, term_dict,
                            doc_length)
        return document

    def check_capital(self, token):
        if len(token.split()) > 1:
            for word in token.split():
                if word.lower() not in self.caps_dict.keys():
                    self.caps_dict[word.lower()] = word[0].islower()
                else:
                    if word[0].islower():
                        self.caps_dict[word.lower()] = True
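# A minimal sketch of the capital-letter conflation rule applied after counting
# above: a term stays upper-case only if every occurrence was capitalized, and a
# single lower-case occurrence folds the whole count onto the lower-case key.
# conflate_case is an illustrative name, not a method of the class above.
from collections import Counter

def conflate_case(tokens):
    counts = Counter()
    for tok in tokens:
        counts[tok.upper() if tok[:1].isupper() else tok.lower()] += 1
    for upper_key in [k for k in counts if k.isupper()]:
        if upper_key.lower() in counts:
            counts[upper_key.lower()] += counts.pop(upper_key)
    return dict(counts)

# Example: conflate_case(['Corona', 'corona', 'Corona']) -> {'corona': 3}
# Example: conflate_case(['NASA', 'Nasa'])               -> {'NASA': 2}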
class Parse: def __init__(self, stemming=None): """ constructor for this class :param stemming: """ self.stop_words = stopwords.words('english') self.stemmer = None if stemming: self.stemmer = Stemmer() self.corona_list = [ "SARS", "sars", "Severe Acute Respiratory Syndrome", "severe acute respiratory syndrome", "SARS-CoV", "SARS CoV", "sars-cov", "sars cov", "coronavirus", "corona virus", "COVID", "covid", "Covid", "COVID-19", "covid-19", "#coronavirus", "COVID__19", "#COVID", "#COVID-19", "#covid19", "#SARS" ] def get_list_without_stopwords(self, list): """ :param list: :return: list without stopwords """ list_without_stopwords = [] stop_words = stopwords.words('english') for w in list: if not w.lower() in stop_words: list_without_stopwords.append(w) return list_without_stopwords def check_If_Upper_More_Then_Lower(self, text): """ This function check the ratio of lower and upper case in a string :param text: :return: true ro false """ if len(text) > 0: count = 0 i = 0 while i < len(text): if text[i].islower(): count = count + 1 i = i + 1 len1 = len(text) if len1 > 0: return count / len(text) < 0.5 else: return False def upperToLowerAfterDot(self, list, index, new_tokens): """ Convert word that appear after dot or : in text :param list: :param index: :param new_tokens: :return: """ if len(list) > index + 1: # term term . & if len(list) > index + 2: if list[index + 1].isalpha() and not list[index + 2].isupper(): new_tokens.append(list[index + 1].lower()) list[index + 1] = "" def Hashtags(self, list, index, new_tokens): """ This function get "@" and concat this term to the next term :param list: :param index: :param new_tokens: :return: """ if len(list) >= index + 1: word = list[index + 1] list[index + 1] = "" if "_" in word: words = word.rsplit("_") else: word = re.sub('([a-zA-Z])', lambda x: x.groups()[0].upper(), word, 1) words = re.findall('[A-Z][^A-Z]*', word) new_word = "" i = 0 while i < len(words): new_tokens.append(words[i].lower()) new_word = new_word + words[i].lower() i += 1 new_tokens.append("#" + new_word) def tags(self, list, index, new_tokens): """ This function separate the string on each time appear upper letter in the string to each time appears "_" to different terms :param list: :param index: :param new_tokens: :return: """ new_word = "@" + list[index + 1] new_tokens.append(new_word) new_tokens.append(list[index + 1].lower()) list[index + 1] = '' def extractUrl(self, list, index): """ Thos function separate the url to terms :param list: :param index: :return: """ word = list[index] tokenize_list_url = re.compile(r'[\:/?=\-&]+', re.UNICODE).split(word) if len(tokenize_list_url) > 1: url = tokenize_list_url[1] if 'www.' 
in url: url2 = url.replace('www.', '') tokenize_list_url.append(url2) list.extend(tokenize_list_url) def handel_percent(self, list, index, new_tokens): """ This function convert "percentage" or "percent" to % and concat the term which appears before the % :param list: :param index: :param new_tokens: :return: """ if not list[index - 1].isalpha(): num = list[index - 1] new_word = num + "%" if index - 1 < len(list): if list[index - 1] in new_tokens: new_tokens.remove(list[index - 1]) new_tokens.append(new_word) def convertNumbersUnits(self, list, index, new_tokens): """ This function convert the units of number :param list: :param index: :param new_tokens: :return: """ numeric_list = WordsToNumber().getNumericWords() if index + 1 < len(list) and list[index + 1].lower() in numeric_list: num = float(list[index]) numericNum = float(WordsToNumber().execute(list[index + 1])) new_Num = str(num * numericNum) new_word = WordsToNumber().handle_number(new_Num) list[index] = '' list[index + 1] = '' new_tokens.append(new_word) elif float(list[index]) >= 1000: new_word = WordsToNumber().handle_number(str(list[index])) list[index] = '' new_tokens.append(new_word) elif self.isFraction(list, index + 1): if "." not in list[index]: new_word = list[index] + " " + list[index + 1] list[index + 1] = '' else: new_word = list[index] new_tokens.append(new_word) else: new_tokens.append(list[index]) def combainCapitalTerms(self, text_tokens): """ This function concat two or more term which appears with capital letters one after one :param text_tokens: :return: """ for index, word in enumerate(text_tokens): if len(word) > 0: if word[0].isupper(): try: list_ca = self.capitalettersTerms(text_tokens, index) text_tokens = text_tokens + list_ca except: print("Could not connect terms") if index == 3: break return text_tokens def capitalettersTerms(self, list, index): result = [] i = 0 word = list[index] if word[0].isupper(): new_word = word i = index if i + 1 < len(list): i = i + 1 loop = 1 while list[i][0].isupper() and index + 1 == i and loop > 5: loop += 1 new_word = new_word + " " + list[i] index += 1 if i + 1 < len(list): i += 1 if not new_word in list: result.insert(index, new_word) else: if list[index][0].isupper() and not new_word in list: result.insert(index, list[index]) else: i += 1 return result def remove_emoji(self, string): """ This function remove emoji from text :param string: :return: """ emoji_pattern = re.compile( "[" u"\U0001F600-\U0001F64F" # emoticons u"\U0001F300-\U0001F5FF" # symbols & pictographs u"\U0001F680-\U0001F6FF" # transport & map symbols u"\U0001F1E0-\U0001F1FF" # flags (iOS) u"\U00002500-\U00002BEF" # chinese char u"\U00002702-\U000027B0" u"\U00002702-\U000027B0" u"\U000024C2-\U0001F251" u"\U0001f926-\U0001f937" u"\U00010000-\U0010ffff" u"\u2640-\u2642" u"\u2600-\u2B55" u"\u200d" u"\u23cf" u"\u23e9" u"\u231a" u"\ufe0f" # dingbats u"\u3030" "]+", flags=re.UNICODE) return emoji_pattern.sub(r'', string) def isFraction(self, list, index): """ This function checke whether the word is a fraction or not :param list: :param index: :return: """ word = list[index] if "/" in word: word = word.replace("/", "") if word.isnumeric(): return True else: return False elif "." 
in word: word = word.replace(".", "") if word.isnumeric(): return True else: return False def isNumber(self, list, index): """ This function checke whether the word is a number or not :param list: :param index: :return: """ word = list[index] if "," in word: word = word.replace(",", "") if word.isnumeric(): list[index] = word return True else: return False elif "." in word and word.count(".") == 1: word = word.replace(".", "") if word.isnumeric(): return True else: return str(list[index]).isnumeric() def handle_dashes(self, list, index, new_tokens): """ This function separate the term by "-" :param list: :param index: :param new_tokens: :return: """ dash_idx = list[index].find('-') if self.stemmer is None: new_tokens.append(list[index].lower()) new_tokens.append(list[index][:dash_idx].lower()) new_tokens.append(list[index][dash_idx + 1:].lower()) else: new_tokens.append(self.stemmer.stem_term(list[index].lower())) new_tokens.append( self.stemmer.stem_term(list[index][:dash_idx].lower())) new_tokens.append( self.stemmer.stem_term(list[index][dash_idx + 1:].lower())) if list[index] in self.corona_list: new_tokens.append("corona") def parse_sentence(self, text): """ This function tokenize, remove stop words and apply lower case for every word within the text :param text: :return: """ new_capital_words = set() temp_entitie = '' will_merge = 0 capital_dict = {} entities_set = set() #text = self.remove_emoji(text) **************************************** #if self.cheak_If_Upper_More_Then_Lower(text): ************************ # text = text.lower() ****************************** text_tokens = word_tokenize(text) try: url = "" if "http" in text: url = re.search("(?P<url>https?://[^\s]+)", text).group("url") if len(url) > 0: text = text.replace(url, "") text_tokens = word_tokenize(text) except: pass #text_tokens = self.get_list_without_stopwords(text_tokens) ******************************* new_tokens = [] # text_tokens_without_stopwords = [w.lower() for w in text_tokens if w not in self.stop_words] for index, word in enumerate(text_tokens): if word == "" or word == " " or word.lower( ) in self.stop_words or word.lower().endswith("'s") or ( len(word) == 1 and ord(word)) > 126: continue # ------------------------------------------------------------------------ upper to lower elif word == "." 
or word == ":": self.upperToLowerAfterDot(text_tokens, index, new_tokens) # -------------------------------------------------------------------------- HashTAG elif word == "#" and index <= len(text_tokens) - 2: self.Hashtags(text_tokens, index, new_tokens) # ---------------------------------------------------------------------------- Tags elif word == "@" and index <= len(text_tokens) - 2: self.tags(text_tokens, index, new_tokens) # ------------------------------------------------------------------------ percent % elif word == "percent" or word == "percentage" or word == '%': self.handel_percent(text_tokens, index, new_tokens) # -------------------------------------------------------------------------- Dollars $ "the number is 80 $ and nata $" elif word == "$": new_tokens.append("dollars") # ------------------------------------------------------------------------- 3 miliom ex elif not word.isalpha(): if self.isNumber(text_tokens, index) or word.isnumeric(): try: self.convertNumbersUnits(text_tokens, index, new_tokens) except: pass # ---------------------------------------------------------------- split the word by the dashes elif '-' in word and len(word) > 1: self.handle_dashes(text_tokens, index, new_tokens) # ------------------------------------------------------------- elif word in self.corona_list: new_tokens.extend([word, "corona"]) # ------------------------------------------------- Otherwise, if it's just a normal word add it elif word.isalpha() or word.isnumeric(): if self.stemmer is not None: word = self.stemmer.stem_term(word) new_tokens.append(word) # ------------------------------------------------- chaning two or more upper words to one term if len(word) > 0 and word[0].isupper(): # chunks entities together. temp_entitie += word + " " will_merge += 1 else: # add entity to the global counter and to the current words set if temp_entitie != '': n = temp_entitie[: -1] # delete the space " " apter the capital term entities_set.add(n) if will_merge > 1: new_capital_words.add(temp_entitie) temp_entitie = '' will_merge = 0 if len(word) > 0 and word[0].isupper(): if word not in capital_dict: capital_dict[word.lower()] = True else: capital_dict[word.lower()] = False if len(url) > 0: list = [] list.append(url) self.extractUrl(list, 0) new_tokens.extend(list) return new_tokens, capital_dict, entities_set def parse_doc(self, doc_as_list): """ This function takes a tweet document as list and break it into different fields :param doc_as_list: list re-preseting the tweet. :return: Document object with corresponding fields. """ tweet_id = doc_as_list[0] tweet_date = doc_as_list[1] full_text = doc_as_list[2] url = doc_as_list[3] retweet_text = doc_as_list[5] retweet_url = doc_as_list[6] quote_text = doc_as_list[8] quote_url = doc_as_list[9] retweet_quoted_text = doc_as_list[11] if quote_text is not None: full_text = full_text + " " + quote_text if retweet_quoted_text is not None: full_text = full_text + " " + retweet_quoted_text #if retweet_text is not None: # full_text = full_text + " " + retweet_text # clean latin letters full_text = re.sub( re.compile( pattern= r'[^\x00-\x7F\x80-\xFF\u0100-\u017F\u0180-\u024F\u1E00-\u1EFF\u2019]' ), u'', full_text) term_dict = {} tokenized_text, capital_dict, entities_set = self.parse_sentence( full_text) doc_length = len(tokenized_text) # after text operations. 
        max_tf = 0
        for idx, term in enumerate(tokenized_text):
            if term not in term_dict.keys():
                term_dict[term] = [idx]
            else:
                term_dict[term].append(idx)
            max_tf = max(len(term_dict[term]), max_tf)  # updated for every term
        unique_terms_in_doc = len(term_dict)
        are_rt = 0
        if full_text.find("rt") == 0 or full_text.find("RT") == 0:
            are_rt = 1
        document = Document(tweet_id, tweet_date, full_text, url, retweet_text,
                            retweet_url, quote_text, quote_url, term_dict,
                            doc_length, max_tf, unique_terms_in_doc, are_rt,
                            capital_dict, entities_set)
        return document
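# A self-contained sketch of the positional term dictionary built above
# (term -> list of positions inside the tweet), with max_tf refreshed on every
# iteration so a tweet of all-unique terms still reports max_tf = 1 rather than 0.
def positional_term_dict(tokens):
    term_dict, max_tf = {}, 0
    for idx, term in enumerate(tokens):
        term_dict.setdefault(term, []).append(idx)
        max_tf = max(max_tf, len(term_dict[term]))
    return term_dict, max_tf

# Example: positional_term_dict(['corona', 'mask', 'corona'])
#          -> ({'corona': [0, 2], 'mask': [1]}, 2)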
def parser_rules(self, token_text, stemming=False): rmv = [] add = [] url_stop = [ "/", "\\", "-", "=", '%', "'", " ", ":", "`", '``', '_', '"', "...", '``', "''", "www." ] delimiters = '|'.join(map(re.escape, url_stop)) all_delimiters = '|'.join(map(re.escape, url_stop + ["."])) nameOrEntity = "" counterOfCapitalInARow = 0 for i, token in enumerate(token_text): if token in self.stop_words or token.lower( ) in self.stop_words or token in url_stop: rmv.append(token) continue # Check for unwanted chars like : . ; , / etc if len(token) == 1 and token not in ["@", "#", "$", "%"]: # if ord(token_text[i]) > 122 or 90 < ord(token_text[i]) < 97 or 57 < ord(token_text[i]) < 64 or 37 < ord(token_text[i]) < 48 or 31 < ord(token_text[i]) < 35: rmv.append(token_text[i]) continue # Remove unwanted expressions if token.__contains__("t.co") or token.__contains__( "http") or token.lower() == "rt" or token.__contains__( "twitter.com"): rmv.append(token) continue # url detector if token.__contains__("//"): token_url = [ t for t in re.split(delimiters, token) if (len(t) > 1) ] rmv.append(token) add += token_url continue # Check if it is a tag if token_text[i] == "@" and i < len(token_text) - 1: token_text[i] = token_text[i] + token_text[i + 1] rmv.append(token_text[i + 1]) continue # Check if it is a hashtag and analyze the hashtag to words according to Upper letters if token_text[i] == "#" and i < len(token_text) - 1: token_text[i] = token_text[i] + token_text[i + 1] rmv.append(token_text[i + 1]) add = self.word_cutter(add, url_stop, token_text[i + 1]) continue # Turn every context of dollars to the word dollar if token.lower() in ["$", "dollars"]: token_text[i] = "dollar" continue # Turn every context of percentage to % if self.is_real_number(token_text[i]) and i < len(token_text) - 1: if token_text[i + 1].lower() in ["%", "percentage", "percent"]: token_text[i] = token_text[i] + "%" rmv.append(token_text[i + 1]) continue # Names and Entities - will be 2 or 3 tokens if token_text[i][0].isupper( ) and counterOfCapitalInARow < 3 and not token_text[i].isnumeric(): nameOrEntity = nameOrEntity + " " + token_text[i] # delete space in the beginning if counterOfCapitalInARow == 0: nameOrEntity = nameOrEntity[1:len(nameOrEntity)] counterOfCapitalInARow += 1 elif 1 < counterOfCapitalInARow < 4: # add to the right set - number of times that the entity exists so far add.append(nameOrEntity.upper()) nameOrEntity = "" counterOfCapitalInARow = 0 else: nameOrEntity = "" counterOfCapitalInARow = 0 # Check if it is a big number if self.is_real_number_comma(token_text[i]): try: # Convert to float and int convertedNumToFloat = float(token_text[i].replace(',', '')) convertedToInt = int(convertedNumToFloat) # The final number if convertedToInt == convertedNumToFloat: finalNumber = convertedToInt else: finalNumber = convertedNumToFloat # Check if the next token is thousand, million, billion or fraction if finalNumber < 1000: if i < len(token_text) - 1 and token_text[i + 1] in [ "Thousand", "thousand", "Thousands", "thousands" ]: convertedToString = str(finalNumber) + "K" elif i < len(token_text) - 1 and token_text[i + 1] in [ "Million", "million", "Millions", "millions" ]: convertedToString = str(finalNumber) + "M" elif i < len(token_text) - 1 and token_text[i + 1] in [ "Billion", "billion", "Billions", "billions" ]: convertedToString = str(finalNumber) + "B" # if the next token is fraction then connect them elif i + 1 < len(token_text) and self.is_fraction( token_text[i + 1]): convertedToString = token_text[ i] + " " + 
token_text[i + 1] else: continue # Add to lists add.append(convertedToString) rmv.append(token_text[i]) rmv.append(token_text[i + 1]) # if it is a thousand number elif 999 < convertedToInt < 999999: finalNumber /= 1000 # After division need to save again 1 or 1.0 for example convertedNumToFloat = float(finalNumber) convertedToInt = int(convertedNumToFloat) if convertedToInt == convertedNumToFloat: finalNumber = convertedToInt else: finalNumber = convertedNumToFloat finalNumber = self.round_down(finalNumber) convertedToString = str(finalNumber) + "K" # Add to lists add.append(convertedToString) rmv.append(token_text[i]) # if it is a Million number elif 999999 < convertedToInt <= 999999999: finalNumber /= 1000000 # After division need to save again 1 or 1.0 for example convertedNumToFloat = float(finalNumber) convertedToInt = int(convertedNumToFloat) if convertedToInt == convertedNumToFloat: finalNumber = convertedToInt else: finalNumber = convertedNumToFloat finalNumber = self.round_down(finalNumber) convertedToString = str(finalNumber) + "M" # Add to lists add.append(convertedToString) rmv.append(token_text[i]) # if it is a Billion number elif 9999999 < convertedToInt: finalNumber /= 1000000000 # After division need to save again 1 or 1.0 for example convertedNumToFloat = float(finalNumber) convertedToInt = int(convertedNumToFloat) if convertedToInt == convertedNumToFloat: finalNumber = convertedToInt else: finalNumber = convertedNumToFloat finalNumber = self.round_down(finalNumber) convertedToString = str(finalNumber) + "B" # Add to lists add.append(convertedToString) rmv.append(token_text[i]) except: continue # Split words that will mean something after splitting if any(one_char in url_stop + ["."] for one_char in token): # print(token_text[i]) token_url = [ t for t in re.split(all_delimiters, token) if (len(t) > 1) ] rmv.append(token) add += token_url continue for w in rmv: if w in token_text: token_text.remove(w) for w2 in add: if w2 == "" or w2 in url_stop: continue token_text.append(w2) # Stem if asked if stemming: s = Stemmer() for i, token in enumerate(token_text): if self.first_alfa_upper(token): token_text[i] = s.stem_term(token).upper() else: token_text[i] = s.stem_term(token) return token_text
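# A short sketch of the "names and entities" heuristic used in parser_rules
# above: two or three consecutive capitalized, non-numeric tokens are joined
# and stored upper-cased. This is an illustrative re-implementation of the
# counterOfCapitalInARow logic, not the exact code path.
def collect_entities(tokens, max_len=3):
    entities, run = [], []
    for tok in tokens + ['']:                    # '' sentinel flushes the last run
        if tok[:1].isupper() and not tok.isnumeric():
            run.append(tok)
        else:
            if 2 <= len(run) <= max_len:
                entities.append(' '.join(run).upper())
            run = []
    return entities

# Example: collect_entities(['Donald', 'Trump', 'said', 'New', 'York', 'City', 'is'])
#          -> ['DONALD TRUMP', 'NEW YORK CITY']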
class Parse: def __init__(self, stemming=0): """ This function initiate the fields of Parse, init the stemmer and entering stop words :param stemming: the boolean value is stem is needed (optional) """ self.stemming = stemming self.stemmer = Stemmer() # self.stop_words = frozenset(stopwords.words('english')) ?????????????????????????????????????????????????????? self.stop_words = stopwords.words('english') self.stop_words.extend([ ':', '\'s', '.', ',', ';', '’', '?', '!', 'rt', '-', '|', '~', '(', ')', '*', '+', '=' '/', '"', '``', '\'\'', '\n', '\n\n', '&', 'amp', '…', '\'', '`', '[', ']', '{', '}' ]) def find_url(self, text): """ This function finds the url addresses in the text (with valid conditions for urls in string) :param text: the full text of the tweet :return: list of all urls in the text """ text = re.sub(r'[^\x00-\x7F]+', '', text) urls = re.findall( 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', text) return urls def tokenize_urls(self, urls): """ This function tokenize the url addresses in the text :param urls: the list of urls in the text :return: list of tokenized words from all urls """ url_tokens = [] tokens_ans = [] for url in urls: if 't.co' not in url: url_tokens.extend(re.split(r';|-|/|//|:|=|\?', url)) for token in url_tokens: if token == 'https' or token == 'http' or token == '': continue elif 'www.' in token: tokens_ans.append(token.replace('www.', '')) # remove garbage (like aH3cdR5ouY) elif token.islower() or token.isdigit(): tokens_ans.append(token) return tokens_ans def number_3digits(self, number): """ This function change the format of the number to 3 digits after the point :param number: the number is int/float :return: the number in the format (with 3 digits after the point) """ return "{:.4g}".format(number) def number_size(self, w, i, text_tokens_list): """ This function checks if the word is number between some range :param w: the word is string :param i: the index is integer :param text_tokens_list: this is a list of tokens :return: the word in the range """ number = int(w) if w.isdigit() else float(w) # Number is in thousand range if 1000 <= number < 1000000: number = number / 1000 w = self.number_3digits(number) + "K" # Number is in million range elif 1000000 <= number < 1000000000: number = number / 1000000 w = self.number_3digits(number) + "M" # Number is in billion range or more elif 1000000000 <= number: number = number / 1000000000 w = self.number_3digits(number) + "B" # Number is in hundred range or less else: w = self.number_3digits(number) # Thousand if i + 1 < len(text_tokens_list) and ( text_tokens_list[i + 1] == 'Thousand' or text_tokens_list[i + 1] == 'thousand'): text_tokens_list[i + 1] = 'K' text_tokens_list[i:(i + 2)] = [ ''.join(text_tokens_list[i:(i + 2)]) ] w = text_tokens_list[i] # Million elif i + 1 < len(text_tokens_list) and ( text_tokens_list[i + 1] == 'Million' or text_tokens_list[i + 1] == 'million'): text_tokens_list[i + 1] = 'M' text_tokens_list[i:(i + 2)] = [ ''.join(text_tokens_list[i:(i + 2)]) ] w = text_tokens_list[i] # Billion elif i + 1 < len(text_tokens_list) and ( text_tokens_list[i + 1] == 'Billion' or text_tokens_list[i + 1] == 'billion'): text_tokens_list[i + 1] = 'B' text_tokens_list[i:(i + 2)] = [ ''.join(text_tokens_list[i:(i + 2)]) ] w = text_tokens_list[i] # Fraction after the number elif i + 1 < len(text_tokens_list) and bool( re.search(r'^-?[0-9]+\/[0-9]+$', text_tokens_list[i + 1])): text_tokens_list[i:(i + 2)] = [ ' '.join(text_tokens_list[i:(i + 2)]) ] w = 
text_tokens_list[i] return w def get_entity(self, text): """ This function finds the entities in the text (two or more words in sequence that starts with capital letter) :param text: the full text of the tweet :return: list of all entities in the text """ entities = re.findall( r'^[A-Z][a-z]+(?: [A-Z][a-z]+)+| [A-Z][a-z]+(?: [A-Z][a-z]+)+', text) for i, entity in enumerate(entities): entities[i] = entity.upper() if entity[0] == ' ': entities[i] = entities[i][1:] return entities def parse_sentence(self, text): """ This function tokenize, remove stop words and apply parser rules for every word within the text :param text: the full text of the tweet :return: list of tokenized words """ full_text = text # Extract the urls from the text and tokenize them separately urls = self.find_url(text) tokenized_urls = [] if len(urls) != 0: tokenized_urls = self.tokenize_urls(urls) for url in urls: text = text.replace(url, '') # Tokenize the text- remove all characters that not ascii, # then split the words in the text by punctuation marks, # and finally clear all white spaces text = re.sub(r'[^\x00-\x7F]+', ',', text) text_tokens = re.split( r'([^a-zA-Z0-9_]|[0-9]*/[0-9]*|[0-9]*,[0-9]*,[0-9]*,[0-9]*|[0-9]*,[0-9]*,[0-9]*|[0-9]*,[0-9]*)', text) # \W text_tokens = list(filter((' ').__ne__, text_tokens)) text_tokens = list(filter(('').__ne__, text_tokens)) # Loops on the tokens list i = 0 while i < len(text_tokens): w = text_tokens[i] # Check if the is stop word- delete her if (w.lower() in self.stop_words) or (w in self.stop_words): del text_tokens[i] continue else: # Find parser rules # (Upper case) - if first letter is capital -> all word is uppercase if len(w) > 1 and w[0].isupper(): text_tokens[i] = w = w.upper() # (@) - if the word is @ and after there is a word -> union those tokens elif w == '@' and i < (len(text_tokens) - 1): text_tokens[i:(i + 2)] = [''.join(text_tokens[i:(i + 2)])] # (#) - if the word is # and after there is a word -> union those tokens (there are more rules here) elif w == '#' and i < (len(text_tokens) - 1) and ( text_tokens[i + 1] == ',' or text_tokens[i + 1] == '#'): del text_tokens[i] del text_tokens[i] continue elif w == '#' and i < (len(text_tokens) - 1) and text_tokens[i + 1] != ',': hashword = text_tokens[i + 1] text_tokens[i:(i + 2)] = [ ''.join(text_tokens[i:(i + 2)]).lower().replace( '_', '') ] separate = hashword.split('_') # in case the words are not separated by _ (like: #home) if len(separate) == 1: # in case the hashtag is all lower case if separate[0].islower(): text_tokens.insert(i, hashword) continue separate = re.findall('[A-Z][^A-Z]*', separate[0]) # new rule: hashtag with sequenced capital letter will be merged to one term (like: #WhereIsKCR) for index, word in enumerate(separate): if len(word) == 1 and word.isupper(): j = index + 1 while j < len(separate) and len(separate[j]) == 1: j += 1 separate[index:(j + 1)] = [ ''.join(separate[index:(j + 1)]) ] # Add the separated words from the hashtag to the tokens list for word in reversed(separate): if len(word) > 0: text_tokens.insert(i, word.lower()) # Numbers elif w.isdigit() or w.replace(',', '').isdigit(): # Remove , text_tokens[i] = w = w.replace(',', '') # . if (i + 1) < len(text_tokens) and text_tokens[ i + 1] == '.' 
and (i + 2) < len( text_tokens) and text_tokens[i + 2].isdigit(): text_tokens[i:(i + 3)] = [''.join(text_tokens[i:(i + 3)])] w = text_tokens[i] # Number% if (i + 1) < len(text_tokens) and text_tokens[i + 1] == '%': text_tokens[i] = self.number_3digits( float(text_tokens[i])) text_tokens[i:(i + 2)] = [''.join(text_tokens[i:(i + 2)])] i += 1 continue # Number percent/percentage -> Number% elif (i + 1) < len(text_tokens) and \ (text_tokens[i + 1] == 'percent' or text_tokens[i + 1] == 'percentage'): text_tokens[i] = self.number_3digits( float(text_tokens[i])) text_tokens[i + 1] = '%' text_tokens[i:(i + 2)] = [''.join(text_tokens[i:(i + 2)])] i += 1 continue # Other numbers- check ranges text_tokens[i] = w = self.number_size(w, i, text_tokens) # new rule: $Number will be merged to one term if i > 0 and text_tokens[i - 1] == '$': text_tokens[(i - 1):(i + 1)] = [ ''.join(text_tokens[(i - 1):(i + 1)]) ] continue i += 1 # stem terms if needed if self.stemming: for j, term in enumerate(text_tokens): if text_tokens[j][0] != '#' and text_tokens[j][0] != '@': text_tokens[j] = self.stemmer.stem_term(term) text_tokens += tokenized_urls return text_tokens # cant change the function signature def parse_doc(self, doc_as_list): """ This function takes a tweet document as list and break it into different fields :param doc_as_list: list re-presenting the tweet. :return: Document object with corresponding fields. """ tweet_id = doc_as_list[0] tweet_date = doc_as_list[1] full_text = doc_as_list[2] url = doc_as_list[3] retweet_text = doc_as_list[5] retweet_url = doc_as_list[6] quote_text = doc_as_list[8] quote_url = doc_as_list[9] entity_list = dict(Counter(self.get_entity(full_text))) # Change the short url in text to the full url (if exist in url dictionary), and send to parse_sentence j = json.loads(url) text = full_text for short in j: if j[short] is not None: text = text.replace(short, j[short]) tokenized_text = self.parse_sentence(text) tokenized_text = list(filter(('').__ne__, tokenized_text)) doc_length = len(tokenized_text) # after text operations. term_dict = dict(Counter(tokenized_text)) document = Document(tweet_id, tweet_date, full_text, url, retweet_text, retweet_url, quote_text, quote_url, term_dict, doc_length, entity_list) return document def parse_query(self, query): list_tokens = self.get_entity(query) list_tokens += self.parse_sentence(query) dict_tokens = dict(Counter(list_tokens)) return dict_tokens
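# A usage sketch for the entity regex used by get_entity above: it captures runs
# of two or more capitalized words (at the start of the text or after a space)
# and upper-cases them. ENTITY_RE simply repeats that pattern so the snippet is
# self-contained.
import re

ENTITY_RE = r'^[A-Z][a-z]+(?: [A-Z][a-z]+)+| [A-Z][a-z]+(?: [A-Z][a-z]+)+'

def extract_entities(text):
    return [m.strip().upper() for m in re.findall(ENTITY_RE, text)]

# Example: extract_entities("Boris Johnson met the World Health Organization")
#          -> ['BORIS JOHNSON', 'WORLD HEALTH ORGANIZATION']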
class Parse: def __init__(self, stemming): self.stop_words = stopwords.words('english') self.stop_words += ["rt", "http", "https", "www", "twitter.com"] # TODO: check & self.terms = set() self.nonstopwords = 0 self.max_tf = 0 self.toStem = stemming self.entities = {} if self.toStem: self.stemmer = Stemmer() def parse_sentence(self, text): """ This function tokenize, remove stop words and apply lower case for every word within the text :param text: :return: """ term_dict = {} entity_dict = {} # Entity recognition by capital letters (2 words or more) for entity in re.findall(ENTITY_PATTERN, text): cleaned_entity = re.sub("-", " ", entity).upper() entity_dict[cleaned_entity] = entity_dict.get(cleaned_entity, 0) + 1 text_tokens = re.findall(TOKENIZER_PATTERN, text) indices_counter = 0 for term in text_tokens: if len(term) < 1: continue indices_counter += 1 if term[0] == "#": # handle hashtags hashtag_list = self.hashtag_parser(term) for mini_term in hashtag_list: self.dictAppender(term_dict, indices_counter, mini_term) elif term[0] == "@": # handle tags no_tag = self.tags_parser(term) self.dictAppender(term_dict, indices_counter, no_tag) elif term in contractions: # remove things like he'll new_terms = contractions[term].split(" ") for mini_term in new_terms: self.dictAppender(term_dict, indices_counter, mini_term) indices_counter += 1 indices_counter -= 1 continue self.dictAppender(term_dict, indices_counter, term) return term_dict, indices_counter, entity_dict def split_url(self, url): url_list = list(filter(None, re.split(SPLIT_URL_PATTERN, url))) return url_list def remove_percent_dollar(self, text): no_dollar = re.sub(DOLLAR_PATTERN, "$", text) return re.sub(PERCENT_PATTERN, "%", no_dollar) def num_manipulation(self, num): num = re.sub(BILLION_PATTERN, "B", num) num = re.sub(MILLION_PATTERN, "M", num) num = re.sub(THOUSAND_PATTERN, "K", num) num = re.sub(BILLION_PATTERN_NUM, r'\1.\3B', num) num = re.sub(MILLION_PATTERN_NUM, r'\1.\3M', num) num = re.sub(THOUSAND_PATTERN_NUM, r'\1.\3K', num) num = re.sub(GENERAL_PATTERN, r'\1.\2\3\5', num) return re.sub(DECIMAL_PATTERN, r'\1\3', num) def url_parser(self, url): """ :param url: recieves a string based dictionary of all urls :return: dictionary with parsed urls """ if len(url) <= 2: #url list is not empty return [] url_list = re.findall( 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', url[1:-1]) finalList = [] for val in url_list: if 'twitter.com/i/web/status/' in val or 't.co' in val: continue val = re.sub(TWITTER_STATUS_PATTERN, r'\2', val) finalList = self.split_url(val) return finalList def hashtag_parser(self, hashtag): splitted_hashtag = list( map( lambda x: x.lower(), filter(lambda x: len(x) > 0, re.split(HASHTAG_PATTERN, hashtag)))) if len(splitted_hashtag) < 2: return splitted_hashtag else: return splitted_hashtag[1:] + [hashtag.lower()] def tags_parser(self, tag): return tag[1:] def dictAppender(self, d, counter, term): # Handling Stemming if self.toStem: stemmed_word = self.stemmer.stem_term(term) if not term.islower(): term = stemmed_word.upper() else: term = stemmed_word # Handling upper & lower cases per document term_lower = term.lower() if not all(ord(c) < 128 for c in term): return if term_lower in self.stop_words: return term_upper = term.upper() if not term.islower(): # upper term = term_upper if term_lower in self.terms: term = term_lower elif term_upper in self.terms: # lower self.terms.remove(term_upper) upper_list = d[term_upper] d.pop(term_upper) d[term_lower] = upper_list 
self.terms.add(term) # Creating indices list self.nonstopwords += 1 tmp_lst = d.get(term, []) tmp_lst.append(counter) d[term] = tmp_lst if self.max_tf < len(tmp_lst): self.max_tf = len(tmp_lst) def parse_doc(self, doc_as_list): # Do NOT change signature """ This function takes a tweet document as list and breaks it into different fields :param doc_as_list: list representing the tweet. :return: Document object with corresponding fields. """ # Get relevant information from tweet tweet_id = doc_as_list[0] full_text = doc_as_list[2] docText = full_text url = doc_as_list[3] quote_text = doc_as_list[8] if quote_text: docText += quote_text self.nonstopwords = 0 self.max_tf = 0 docText = re.sub(REMOVE_URL_PATTERN, "", docText) # link (urls) removal from fulltext docText = self.num_manipulation(docText) docText = self.remove_percent_dollar(docText) tokenized_dict, indices_counter, entity_dict = self.parse_sentence( docText) urlTermList = self.url_parser(url) for term in urlTermList: indices_counter += 1 self.dictAppender(tokenized_dict, indices_counter, term) doc_length = self.nonstopwords # after text operations. document = Document(tweet_id, term_doc_dictionary=tokenized_dict, doc_length=doc_length, max_tf=self.max_tf, entities_dict=entity_dict) return document def parse_query(self, query): # return {term: ([indices,tf])} self.nonstopwords = 0 self.max_tf = 0 docText = self.num_manipulation(query) docText = self.remove_percent_dollar(docText) tokenized_dict, indices_counter, entity_dict = self.parse_sentence( docText) return tokenized_dict, entity_dict def remove_stopwords(self, query): text_tokens = re.findall(TOKENIZER_PATTERN, query) tokens = list( filter(lambda x: x.lower() not in self.stop_words, text_tokens)) query = ' '.join(tokens) return query
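# Two small, stand-alone sketches of the bookkeeping that dictAppender and parse_doc
# perform above, written under assumed simplifications; CaseFoldingVocabulary and
# build_positional_index are illustrative names, not part of the original code.
#
# 1) Case folding: a term that has only ever appeared capitalized is kept upper-case,
#    but once a lower-case occurrence is seen, the upper-case entry is merged into it.
# 2) Positional index: each term maps to the list of its positions in the document,
#    and max_tf is the length of the longest such list.


class CaseFoldingVocabulary:
    def __init__(self):
        self.counts = {}

    def add(self, term):
        lower, upper = term.lower(), term.upper()
        if term.islower():
            # A lower-case occurrence wins: fold any existing upper-case entry into it.
            if upper in self.counts:
                self.counts[lower] = self.counts.get(lower, 0) + self.counts.pop(upper)
            self.counts[lower] = self.counts.get(lower, 0) + 1
        else:
            # Capitalized occurrence: keep it upper-case only if no lower-case form exists yet.
            key = lower if lower in self.counts else upper
            self.counts[key] = self.counts.get(key, 0) + 1


def build_positional_index(tokens):
    positions, max_tf = {}, 0
    for i, term in enumerate(tokens, start=1):
        positions.setdefault(term, []).append(i)
        max_tf = max(max_tf, len(positions[term]))
    return positions, max_tf


# v = CaseFoldingVocabulary(); v.add("Trump"); v.add("trump")   -> v.counts == {'trump': 2}
# build_positional_index(["covid", "cases", "covid"])           -> ({'covid': [1, 3], 'cases': [2]}, 2)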
class Parse: THOUSAND = 1000 MILLION = 1000000 BILLION = 1000000000 TRILLION = 1000000000000 QUANTITIES = { 'thousand': 'K', 'thousands': 'K', 'million': 'M', 'millions': 'M', 'billion': 'B', 'billions': 'B', 'trillion': 'TR', 'trillions': 'TR' } SIGNS = {'$': '$', 'usd': '$'} QUANTITIES_LIST = ['K', 'M', 'B', 'TR', 'TRX', 'TRXX'] def __init__(self, config): self.with_stem = config.get_toStem() self.stemmer = Stemmer() self.stop_words = stopwords.words('english') self.stop_words.extend([ r' ', r'', r"", r"''", r'""', r'"', r"“", r"”", r"’", r"‘", r"``", r"'", r"`", '"' ]) self.stop_words.extend([ 'rt', r'!', r'?', r',', r':', r';', r'(', r')', r'...', r'[', ']', r'{', '}' "'&'", '$', '.', r'\'s', '\'s', '\'d', r'\'d', r'n\'t' ]) self.stop_words.extend(['1️⃣.1️⃣2️⃣']) self.stop_words_dict = dict.fromkeys(self.stop_words) # for avg self.total_len_docs = 0 self.number_of_documents = 0 self.url_pattern = re.compile('http\S+') self.url_www_pattern = re.compile("[/://?=]") # TODO - fix numbers pattern self.numbers_pattern = re.compile(('^\d+([/|.|,]?\d+)*')) self.non_latin_pattern = re.compile( pattern= r'[^\x00-\x7F\x80-\xFF\u0100-\u017F\u0180-\u024F\u1E00-\u1EFF\u2019]' ) self.dates_pattern = re.compile( r'^(?:(?:31(\/|-|\.)(?:0?[13578]|1[02]))\1|(?:(?:29|30)(\/|-|\.)(?:0?[13-9]|1[0-2])\2))(?:(?:1[6-9]|[2-9]\d)?\d{2})$|^(?:29(\/|-|\.)0?2\3(?:(?:(?:1[6-9]|[2-9]\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))$|^(?:0?[1-9]|1\d|2[0-8])(\/|-|\.)(?:(?:0?[1-9])|(?:1[0-2]))\4(?:(?:1[6-9]|[2-9]\d)?\d{2})$' ) # TODO - fix emoji to include all emojis self.emojis_pattern = re.compile( pattern="[" u"\U0001F600-\U0001F64F" # emoticons u"\U0001F300-\U0001F5FF" # symbols & pictographs u"\U0001F680-\U0001F6FF" # transport & map symbols u"\U0001F1E0-\U0001F1FF" # flags (iOS) u"\U00002500-\U00002BEF" # chinese char u"\U00010000-\U0010ffff" u"\U0001f926-\U0001f937" u"\U000024C2-\U0001F251" u"\U00002702-\U000027B0" u"\u2640-\u2642" u"\u200d" u"\u23cf" u"\u23e9" u"\u231a" u"\ufe0f" u"\u3030" u"\u2600-\u2B55" u"\uFE0F\u20E3\uFE0F\u20E3\uFE0F\u20E3" "]+", flags=re.UNICODE) def parse_hashtag(self, all_tokens_list, token): if len(token) <= 1: return t = [] # --> #stay_at_home if '_' in token: t.append('#' + re.sub(r'_', '', token)) t += re.split(r'_', token) else: # --> #stayAtHome if not token.isupper(): t.append('#' + token) t += re.findall('[A-Z][^A-Z]*', token) # --> #ASD else: all_tokens_list.append('#' + token) return t = [x.lower() for x in t] all_tokens_list += t def parse_numbers(self, all_tokens_list, token, before_token, after_token, text_tokens): def helper(num): count = -1 while num >= 1000: num /= 1000 count += 1 # fixed the case of 140.000K if num.is_integer(): num = int(num) return num, count return ("%.3f" % num), count if '/' in token: all_tokens_list.append(token) return if ',' in token: token = token.replace(',', '') try: token = float(token) except: # from this type - 10.07.2020 all_tokens_list.append(token) return if token.is_integer(): token = int(token) b_tok = None is_pers = None if before_token and before_token in Parse.SIGNS: b_tok = Parse.SIGNS[before_token] if after_token: after_token = after_token.lower() if after_token in Parse.QUANTITIES: if token < 1000: if b_tok: all_tokens_list.append(b_tok + str(token) + Parse.QUANTITIES[after_token]) return else: all_tokens_list.append( str(token) + Parse.QUANTITIES[after_token]) return # if we have after and token > 1000 num, count = helper(token) i = Parse.QUANTITIES_LIST.index( Parse.QUANTITIES[after_token]) + 1 count 
= count + i if count > 2: count = count - 2 while (count > 0): num = float(num) * 1000 count -= 1 if num.is_integer(): num = int(num) all_tokens_list.append(str(num) + 'B') return else: after_token = Parse.QUANTITIES_LIST[count] all_tokens_list.append(str(num) + after_token) return if after_token == 'percent' or after_token == 'percentage' or after_token == '%': is_pers = True if token < 1000: final_t = str(token) else: num, count = helper(token) try: # more then B if count > 2: count = count - 2 while (count > 0): num = float(num) * 1000 count -= 1 if num.is_integer(): num = int(num) final_t = str(num) + 'B' else: after = Parse.QUANTITIES_LIST[count] final_t = str(num) + after except: pass if b_tok: all_tokens_list.append(b_tok + str(final_t)) elif is_pers: all_tokens_list.append(str(final_t) + '%') else: all_tokens_list.append(str(final_t)) def parse_sentence(self, text): """ This function tokenize, remove stop words and apply lower case for every word within the text :param text: :return: """ tokenized_text = [] text_tokens = word_tokenize(text) entity = '' entity_counter = 0 entities_set = set() small_big_dict = {} for i, token in enumerate(text_tokens): if token == ' ': continue # EMOJIS - extract the token without the emojis if re.match(self.emojis_pattern, token): token = self.emojis_pattern.sub(r'', token) tokenized_text.append(token.lower()) entity = '' entity_counter = 0 continue if token == '@': if i < (len(text_tokens) - 1): tokenized_text.append(token + text_tokens[i + 1]) text_tokens[i + 1] = ' ' # skip the next token entity = '' entity_counter = 0 continue if token == '#': if i < (len(text_tokens) - 1): self.parse_hashtag(tokenized_text, text_tokens[i + 1]) text_tokens[i + 1] = ' ' # skip the next token entity = '' entity_counter = 0 continue # DATES date_match = self.dates_pattern.match(token) if date_match: tokenized_text.append(token) # NUMBERS # number_match = self.numbers_pattern_1.match(token) or self.numbers_pattern_2.match(token) number_match = self.numbers_pattern.match(token) if number_match != None: # Numbers over TR if len(token) > 18: tokenized_text.append(token) entity = '' entity_counter = 0 continue start, stop = number_match.span() if (stop - start) == len(token): before_t = None after_t = None if i < (len(text_tokens) - 1): after_t = text_tokens[i + 1] if i > 0: before_t = text_tokens[i - 1] self.parse_numbers(tokenized_text, token, before_t, after_t, text_tokens) entity = '' entity_counter = 0 continue url_match = self.url_pattern.match(token) if url_match: if i + 2 < len(text_tokens): if text_tokens[i + 2]: tokenized_text += self.parse_url(text_tokens[i + 2]) text_tokens[i + 1] = ' ' # skip the next token text_tokens[i + 2] = ' ' # skip the next token entity = '' entity_counter = 0 continue # ENTITY AND SMALL_BIG if token.isalpha() and token.lower() not in self.stop_words_dict: if token[0].isupper(): entity += token + ' ' entity_counter += 1 continue else: # entity dict -> decide >= 2 is an entity if entity_counter > 1: # self.entities.append(entity[:-1]) entities_set.add(entity[:-1]) tokenized_text.append(entity[:-1]) entity = '' entity_counter = 0 continue # small_big dict for entity elif entity_counter == 1: entity = entity[:1] if entity not in small_big_dict.keys(): small_big_dict[token.lower()] = False # now we have small letter token if token not in small_big_dict.keys( ) or not small_big_dict[token]: small_big_dict[token.lower()] = True if '-' in token: tokenized_text.append(token) split_tok = [t.lower() for t in token.split('-')] tokenized_text += 
split_tok continue # append all regular words suffix = "…" if self.with_stem: token = self.stemmer.stem_term(token) token = token.lower() if token not in self.stop_words_dict and not token.endswith( suffix) and token != suffix and len(token) > 1: tokenized_text.append(token) return tokenized_text, entities_set, small_big_dict def parse_url(self, token): split_url = self.url_www_pattern.split(token) if 't.co' in split_url or 'twitter.com' in split_url: return [split_url[-1].lower()] if len(split_url) > 3 and 'www.' in split_url[3]: split_url[3] = split_url[3][4:] return [t.lower() for t in split_url if (t != 'https' and t != '')] def get_urls(self, all_urls): urls = {} for url in all_urls: if url: urls.update(dict(json.loads(url))) return urls def get_texts(self, all_texts): final_text = "" for text in all_texts: if text: final_text += ' ' + text return final_text def parse_doc(self, doc_as_list): """ This function takes a tweet document as list and break it into different fields :param doc_as_list: list re-preseting the tweet. :return: Document object with corresponding fields. """ tweet_id = doc_as_list[0] tweet_date = doc_as_list[1] tweet_date_obj = datetime.strptime(tweet_date, '%a %b %d %X %z %Y') full_text = doc_as_list[2] url = doc_as_list[3] # indices = doc_as_list[4] retweet_text = doc_as_list[5] retweet_url = doc_as_list[6] # retweet_indices = doc_as_list[7] quote_text = doc_as_list[8] quote_url = doc_as_list[9] # quote_indice = doc_as_list[10] retweet_quoted_text = doc_as_list[11] retweet_quoted_urls = doc_as_list[12] # retweet_quoted_indices = doc_as_list[13] term_dict = {} tokenized_text = [] # parse all urls urls = self.get_urls( [url, retweet_url, quote_url, retweet_quoted_urls]) for (key, value) in urls.items(): if value: tokenized_text += self.parse_url(value) elif key: tokenized_text += self.parse_url(key) all_texts = self.get_texts( [full_text, quote_text, retweet_quoted_text]) # remove urls from text, only if exist in url if len(urls) > 0: all_texts = self.url_pattern.sub('', all_texts) all_texts = self.non_latin_pattern.sub('', all_texts) tokenized_text, entities_set, small_big = self.parse_sentence( all_texts) unique_terms = set(tokenized_text) doc_length = len(tokenized_text) # after text operations. max_tf = 1 # save only tf for each term in tweet for index, term in enumerate(tokenized_text): if term not in term_dict: term_dict[term] = 1 else: term_dict[term] += 1 if term_dict[term] > max_tf: max_tf = term_dict[term] self.total_len_docs += doc_length self.number_of_documents += 1 # TODO - check if we need to save tokenized_text document = Document(tweet_id, max_tf, entities_set, small_big, unique_terms, tweet_date_obj, term_dict, doc_length) return document
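# Two self-contained sketches of techniques used by the Parse class above, written under
# assumed simplifications; compact_number and extract_entities are illustrative names,
# not the original API.
#
# 1) compact_number mirrors the helper inside parse_numbers: repeatedly divide by 1000,
#    count the steps, and attach the matching K/M/B suffix (so 7500 -> '7.5K').
# 2) extract_entities mirrors the entity heuristic in parse_sentence: a run of two or
#    more consecutive capitalized words is recorded as a single entity.


def compact_number(value):
    suffixes = ['K', 'M', 'B']
    num, count = float(value), -1
    while num >= 1000 and count < len(suffixes) - 1:
        num /= 1000
        count += 1
    if count < 0:  # below one thousand: leave the number as-is
        return str(int(num)) if num.is_integer() else str(num)
    num = int(num) if num.is_integer() else round(num, 3)
    return "{}{}".format(num, suffixes[count])


def extract_entities(tokens):
    entities, run = set(), []
    for tok in tokens + [""]:  # the empty sentinel flushes a trailing run
        if tok.isalpha() and tok[:1].isupper():
            run.append(tok)
        else:
            if len(run) >= 2:
                entities.add(" ".join(run))
            run = []
    return entities


# compact_number(1_340_000_000)                                    -> '1.34B'
# extract_entities("Donald Trump spoke in New York today".split()) -> {'Donald Trump', 'New York'}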
class Parse: THOUSAND = 1000 MILLION = 1000000 BILLION = 1000000000 TRILLION = 1000000000000 QUANTITIES = { 'thousand': 'K', 'thousands': 'K', 'million': 'M', 'millions': 'M', 'billion': 'B', 'billions': 'B', 'trillion': 'TR', 'trillions': 'TR' } SIGNS = {'$': '$', 'usd': '$'} QUANTITIES_LIST = ['K', 'M', 'B', 'TR', 'TRX', 'TRXX'] def __init__(self, config): self.with_stem = config.get_toStem() self.stemmer = Stemmer() self.stop_words = stopwords.words('english') self.stop_words.extend(['RT']) self.stop_words_dict = dict.fromkeys(self.stop_words) # for avg self.total_len_docs = 0 self.number_of_documents = 0 self.url_removal_pattern = re.compile(r'(https?://[^\s]+)') # TODO - fix numbers pattern self.numbers_pattern = re.compile(('^\d+([/|.|,]?\d+)*')) self.dates_pattern = re.compile( r'^(?:(?:31(\/|-|\.)(?:0?[13578]|1[02]))\1|(?:(?:29|30)(\/|-|\.)(?:0?[13-9]|1[0-2])\2))(?:(?:1[6-9]|[2-9]\d)?\d{2})$|^(?:29(\/|-|\.)0?2\3(?:(?:(?:1[6-9]|[2-9]\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))$|^(?:0?[1-9]|1\d|2[0-8])(\/|-|\.)(?:(?:0?[1-9])|(?:1[0-2]))\4(?:(?:1[6-9]|[2-9]\d)?\d{2})$' ) # i changed this for more informative words (still in comments) def parse_hashtag(self, all_tokens_list, token): if len(token) <= 1: return t = [] # --> #stay_at_home if '_' in token: t += re.split(r'_', token) else: # --> #stayAtHome if not token.isupper(): t += re.findall('[A-Z][^A-Z]*', token) # --> #ASD else: # all_tokens_list.append('#' + token) if self.with_stem: token = self.stemmer.stem_term(token) if len(token) == 1: return all_tokens_list.append(token.lower()) return if self.with_stem: t = [self.stemmer.stem_term(x) for x in t] else: t = [x.lower() for x in t] if '' in t: t.remove('') for term in t: if len(term) > 1: all_tokens_list.append(term) def parse_numbers(self, all_tokens_list, token, before_token, after_token, text_tokens): def helper(num): count = -1 while num >= 1000: num /= 1000 count += 1 # fixed the case of 140.000K if num.is_integer(): num = int(num) return num, count return ("%.3f" % num), count if '/' in token: all_tokens_list.append(token) return if ',' in token: token = token.replace(',', '') try: token = float(token) except: # from this type - 10.07.2020 all_tokens_list.append(token) return if token.is_integer(): token = int(token) b_tok = None is_pers = None if before_token and before_token in Parse.SIGNS: b_tok = Parse.SIGNS[before_token] if after_token: after_token = after_token.lower() if after_token in Parse.QUANTITIES: if token < 1000: if b_tok: all_tokens_list.append(b_tok + str(token) + Parse.QUANTITIES[after_token]) return else: all_tokens_list.append( str(token) + Parse.QUANTITIES[after_token]) return # if we have after and token > 1000 num, count = helper(token) i = Parse.QUANTITIES_LIST.index( Parse.QUANTITIES[after_token]) + 1 count = count + i if count > 2: count = count - 2 while (count > 0): num = float(num) * 1000 count -= 1 if num.is_integer(): num = int(num) all_tokens_list.append(str(num) + 'B') return else: after_token = Parse.QUANTITIES_LIST[count] all_tokens_list.append(str(num) + after_token) return if after_token == 'percent' or after_token == 'percentage' or after_token == '%': is_pers = True if token < 1000: final_t = str(token) else: num, count = helper(token) try: # more then B if count > 2: count = count - 2 while (count > 0): num = float(num) * 1000 count -= 1 if num.is_integer(): num = int(num) final_t = str(num) + 'B' else: after = Parse.QUANTITIES_LIST[count] final_t = str(num) + after except: pass if b_tok: 
all_tokens_list.append(b_tok + str(final_t)) elif is_pers: all_tokens_list.append(str(final_t) + '%') else: all_tokens_list.append(str(final_t)) def is_cool(self, token): if type(token) == int: return True if len(token) == 0: return False if token in self.stop_words_dict: return False return all((ord(char) > 32) and (ord(char) < 128) for char in token) def parse_sentence(self, text): """ This function tokenize, remove stop words and apply lower case for every word within the text :param text: :return: """ tokenized_text = [] text_tokens = word_tokenize(text) entities_set = set() small_big_dict = {} skip = False for i, token in enumerate(text_tokens): if (skip): skip = False continue if self.is_cool(token): if token == '#': if i < (len(text_tokens) - 1): self.parse_hashtag(tokenized_text, text_tokens[i + 1]) skip = True # DATES date_match = self.dates_pattern.match(token) if date_match: tokenized_text.append(token) # NUMBERS number_match = self.numbers_pattern.match(token) if number_match != None: # Numbers over TR if len(token) > 18: tokenized_text.append(token) start, stop = number_match.span() if (stop - start) == len(token): before_t = None after_t = None if i < (len(text_tokens) - 1): after_t = text_tokens[i + 1] if i > 0: before_t = text_tokens[i - 1] self.parse_numbers(tokenized_text, token, before_t, after_t, text_tokens) if ('.' in token) and (len(token) > 1) and any(c.isalpha() for c in token): tokenized_text.append(token) if '-' in token and len(token) > 1: if token == '--': continue if self.with_stem: token = self.stemmer.stem_term(token) tokenized_text.append(token.lower()) if token.isalpha( ) and token not in self.stop_words_dict and token.lower( ) not in self.stop_words_dict and len(token) > 1: if token not in self.stop_words_dict and len(token) > 1: if self.with_stem: token = self.stemmer.stem_term(token) tokenized_text.append(token.lower()) return tokenized_text, entities_set, small_big_dict def url_parse(self, token): domain = token.split("//")[-1].split("/")[0].split('?')[0] if 'www' in domain and 'com' in domain: domain = domain.split('.') return domain[1] def get_urls(self, all_urls): urls = {} for url in all_urls: if url: urls.update(dict(json.loads(url))) return urls def get_texts(self, all_texts): final_text = "" for text in all_texts: if text: final_text += ' ' + text return final_text def parse_doc(self, doc_as_list): """ This function takes a tweet document as list and break it into different fields :param doc_as_list: list re-preseting the tweet. :return: Document object with corresponding fields. """ tweet_id = doc_as_list[0] tweet_date = doc_as_list[1] tweet_date_obj = datetime.strptime(tweet_date, '%a %b %d %X %z %Y') full_text = doc_as_list[2] url = doc_as_list[3] retweet_url = doc_as_list[6] quote_text = doc_as_list[8] quote_url = doc_as_list[9] retweet_quoted_text = doc_as_list[11] retweet_quoted_urls = doc_as_list[12] term_dict = {} tokenized_text = [] # parse all urls urls = self.get_urls( [url, retweet_url, quote_url, retweet_quoted_urls]) for (key, value) in urls.items(): if value: domain = self.url_parse(value) if domain: tokenized_text += domain all_texts = self.get_texts( [full_text, quote_text, retweet_quoted_text]) # remove urls from the text all_texts = self.url_removal_pattern.sub('', all_texts) tokenized_text, entities_set, small_big = self.parse_sentence( all_texts) unique_terms = set(tokenized_text) doc_length = len(tokenized_text) # after text operations. 
max_tf = 1 # save only tf for each term in tweet for index, term in enumerate(tokenized_text): if term not in term_dict: term_dict[term] = 1 else: term_dict[term] += 1 if term_dict[term] > max_tf: max_tf = term_dict[term] self.total_len_docs += doc_length self.number_of_documents += 1 # TODO - check if we need to save tokenized_text document = Document(tweet_id, max_tf, entities_set, small_big, unique_terms, tweet_date_obj, term_dict, doc_length) return document
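# Two closing sketches for this variant, under assumed simplifications; extract_domain
# and CorpusStats are illustrative names, not part of the original code.
#
# 1) extract_domain mirrors url_parse above: strip the scheme, path and query string,
#    and for 'www.<name>.com'-style hosts keep only the middle component.
# 2) CorpusStats mirrors the statistics gathered at the end of parse_doc: per-document
#    term frequencies, the maximum term frequency, and a running average document
#    length across the corpus (useful later for length normalization in ranking).
from collections import Counter


def extract_domain(url):
    host = url.split("//")[-1].split("/")[0].split("?")[0]
    parts = host.split(".")
    if len(parts) >= 3 and parts[0] == "www":
        return parts[1]  # 'www.cnn.com' -> 'cnn'
    return host          # anything else ('t.co', bare hosts) is kept whole


class CorpusStats:
    def __init__(self):
        self.total_len = 0
        self.num_docs = 0

    def add_document(self, tokens):
        term_dict = dict(Counter(tokens))
        max_tf = max(term_dict.values(), default=1)
        self.total_len += len(tokens)
        self.num_docs += 1
        return term_dict, max_tf

    @property
    def avg_doc_length(self):
        return self.total_len / self.num_docs if self.num_docs else 0.0


# extract_domain("https://www.cnn.com/2020/health/covid.html")  -> 'cnn'
# CorpusStats().add_document(["covid", "cases", "covid"])       -> ({'covid': 2, 'cases': 1}, 2)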