Example #1
    def add_new_doc(self, document, documents_list_length=10000):
        """
        This function performs the indexing process for a document object.
        The saved information is captured via two dictionaries ('inverted index' and 'posting').
        :param document: a document that needs to be indexed.
        :return: -
        """

        try:
            document_dictionary = document.term_doc_dictionary
            # self.countDoc += 1
            if self.stemming == 'y':
                my_stemmer = Stemmer()  # create the stemmer once rather than once per term
            for term in document_dictionary.keys():
                if self.stemming == 'y':
                    term = my_stemmer.stem_term(term)
                    # Update inverted index and posting
                if term not in self.inverted_idx.keys():
                    self.inverted_idx[term] = [
                        1, [(document_dictionary[term], document.tweet_id)]
                    ]  # amount of doc, freq in the doc, doc id.

                else:
                    self.inverted_idx[term][0] += 1  # amount of doc
                    self.inverted_idx[term][1].append(
                        (document_dictionary[term],
                         document.tweet_id))  # freq in the doc # doc id

                if term not in self.postingDict.keys():
                    self.postingDict[term] = [(document.tweet_id,
                                               document_dictionary[term])]
                else:
                    self.postingDict[term].append(
                        (document.tweet_id, document_dictionary[term]))
                # self.countTweet -= 1

                if document.tweet_id not in self.tweet_dict.keys():
                    # [term, freq in tweet], number of unique terms in tweet, number of terms in tweet
                    self.tweet_dict[document.tweet_id] = [
                        [term, document_dictionary[term]], 1, 0
                    ]
                elif document_dictionary[term] > self.tweet_dict[
                        document.tweet_id][0][1]:
                    # tweet exists: the new term is more frequent than the current most common term
                    if self.tweet_dict[document.tweet_id][0][1] == 1:
                        # the term being replaced was unique, so count it
                        self.tweet_dict[document.tweet_id][1] += 1
                    # replace the most common term
                    self.tweet_dict[document.tweet_id][0] = [
                        term, document_dictionary[term]
                    ]
                    self.tweet_dict[document.tweet_id][2] += 1
                elif document_dictionary[term] == 1:
                    # tweet exists, term is not the most common; check whether it is unique
                    self.tweet_dict[document.tweet_id][1] += 1
                    self.tweet_dict[document.tweet_id][2] += 1
        except:
            # print('problem in indexer : add_new_doc')
            # print(traceback.print_exc())
            pass

    def parse_doc(self, doc_as_list):
        """
        This function takes a tweet document as a list and breaks it into different fields.
        :param doc_as_list: list representing the tweet.
        :return: Document object with corresponding fields.
        """

        tweet_id = doc_as_list[0]
        tweet_date = doc_as_list[1]
        full_text = doc_as_list[2]
        url = doc_as_list[3]
        indice = doc_as_list[4]
        retweet_text = doc_as_list[5]
        retweet_url = doc_as_list[6]
        retweet_indice = doc_as_list[7]
        quote_text = doc_as_list[8]
        quote_url = doc_as_list[9]
        quoted_indice = doc_as_list[10]
        retweet_quoted_text = doc_as_list[11]
        retweet_quoted_url = doc_as_list[12]
        retweet_quoted_indice = doc_as_list[13]

        term_dict = {}

        tokenized_text = self.parse_sentence(full_text)
        tokenized_quote = self.parse_sentence(quote_text)
        tokenized_url = self.handle_url(url)

        doc_length = len(
            tokenized_text)  # after text operations - length of full_text

        new_tokenized_text = tokenized_text + tokenized_url + tokenized_quote

        if self.stemming is True:
            s = Stemmer()
            # stem every token; build a new list instead of mutating the list while iterating over it
            new_tokenized_text = [s.stem_term(token) for token in new_tokenized_text]

        for term in new_tokenized_text:
            if term is not "":  # or (term.isalpha() and len(term) == 1)
                if term not in term_dict:
                    term_dict[term] = 1
                else:
                    term_dict[term] += 1

        document = Document(tweet_id, tweet_date, full_text, url, retweet_text,
                            retweet_url, quote_text, quote_url, term_dict,
                            doc_length)

        return document
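A minimal, self-contained sketch of the inverted-index and posting bookkeeping that add_new_doc performs above, assuming the per-tweet term frequencies are already computed. All names here are hypothetical and independent of the original Document and Stemmer classes.

# Hypothetical sketch of the inverted-index / posting structure built by add_new_doc,
# reduced to plain dicts so it runs on its own.
from collections import defaultdict

inverted_idx = {}                  # term -> [doc_count, [(freq_in_doc, tweet_id), ...]]
posting_dict = defaultdict(list)   # term -> [(tweet_id, freq_in_doc), ...]

def index_tweet(tweet_id, term_freqs):
    """term_freqs: dict mapping term -> frequency inside this tweet."""
    for term, freq in term_freqs.items():
        if term not in inverted_idx:
            inverted_idx[term] = [1, [(freq, tweet_id)]]
        else:
            inverted_idx[term][0] += 1
            inverted_idx[term][1].append((freq, tweet_id))
        posting_dict[term].append((tweet_id, freq))

index_tweet("123", {"corona": 2, "virus": 1})
index_tweet("456", {"virus": 3})
print(inverted_idx["virus"])       # [2, [(1, '123'), (3, '456')]]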
Example #3
class Parse:
    def __init__(self, stem):
        self.stop_words = stopwords.words('english')
        self.stop_words.extend([
            'ourselves', 'hers', 'between', 'yourself', 'but', 'again',
            'there', 'about', 'once', 'during', 'out', 'very', 'having',
            'with', 'they', 'own', 'an', 'be', 'some', 'for', 'do', 'its',
            'yours', 'such', 'into', 'of', 'most', 'itself', 'other', 'off',
            'is', 's', 'am', 'or', 'who', 'as', 'from', 'him', 'each', 'the',
            'themselves', 'until', 'below', 'are', 'we', 'these', 'your',
            'his', 'through', 'don', 'nor', 'me', 'were', 'her', 'more',
            'himself', 'this', 'down', 'should', 'our', 'their', 'while',
            'above', 'both', 'up', 'to', 'ours', 'had', 'she', 'all', 'when',
            'at', 'any', 'before', 'them', 'same', 'and', 'been', 'have', 'in',
            'will', 'on', 'does', 'yourselves', 'then', 'that', 'because',
            'what', 'over', 'why', 'so', 'can', 'did', 'now', 'under', 'he',
            'you', 'herself', 'has', 'just', 'where', 'too', 'only', 'myself',
            'which', 'those', 'i', 'after', 'few', 'whom', 't', 'being', 'if',
            'theirs', 'my', 'against', 'a', 'by', 'doing', 'it', 'how',
            'further', 'was', 'here', 'than', 'rt', "don't", '-', '&',
            'it’s', 'don’t', 'i’m', "it's", "doesn't", 'https', 't.co',
            'twitter.com', 'weve', 'ur', 'due', 'damn', 'us', 'theyre',
            'would', 'might'
        ])
        self.stop_words_dict = {
            self.stop_words[i]: 0
            for i in range(0, len(self.stop_words))
        }
        # self.extra_stop_words = {"rt": 0, "https": 0, "t.co": 0, "twitter.com": 0, "weve": 0, "ur": 0, "due": 0, "damn": 0, "us": 0, "theyre": 0, "would": 0, "might": 0}
        # self.stop_words_dict.update(self.extra_stop_words)
        self.term_dict = {}
        self.toStem = stem
        self.text_tokens = []
        if self.toStem:
            self.stemmer = Stemmer()

    def parse_sentence(self, text):
        """
        This function tokenizes the text, removes stop words and applies lower case to every word within the text.
        :param text:
        :return:
        """

        text_splitted = text.split()
        # stop_words = stopwords.words('english')
        ##lancaster = LancasterStemmer()
        i = 0
        while i < len(text_splitted):
            try:
                word = text_splitted[i].strip('[').strip(']').strip('(').strip(
                    ')').strip('{').strip('}')
                word = re.sub('[^A-z-%_@#.,$!?/0-9]', '', word)
                if word[len(word) - 1] == '%':
                    new_word = word[:len(word) - 1]
                    if new_word.isdigit() or re.search(
                            r'^-?[0-9]+\.[0-9]+$', new_word) or re.search(
                                r'^-?[0-9]+\/[0-9]+$', new_word):
                        number = self.parse_numbers(new_word)
                        percent_number = str(number) + '%'
                        self.text_tokens.append(percent_number)
                        i += 1
                        continue
                    else:
                        word = re.sub('[^A-z.%0-9]', '', word)
                        if word != '':
                            self.text_tokens.append(word)
                    i += 1
                    continue
                elif word.isdigit() or re.search(
                        r'^-?[0-9]+\.[0-9]+$', word) or re.search(
                            r'^-?[0-9]+\/[0-9]+$', word):
                    if i < len(text_splitted) - 1:
                        next_word = re.sub('[^A-z%_@#.,!?$/0-9]', '',
                                           text_splitted[i + 1])
                        number = self.parse_numbers(word, next_word)
                        if number.endswith('K') or number.endswith(
                                'B') or number.endswith('M'):
                            i += 1
                        elif (next_word == 'percent') or (next_word
                                                          == 'percentage'):
                            number = str(word) + '%'
                            i += 1
                        self.text_tokens.append(number)
                        i += 1
                    else:
                        number = self.parse_numbers(word)
                        self.text_tokens.append(number)
                        i += 1
                    continue
            except:
                ## token is not a number
                word = re.sub('[^A-z-%_@#.,$!?/0-9]', '', text_splitted[i])

            if word.startswith('http') or word.startswith('www'):
                i += 1
                continue

            word = re.sub(r'([-?!/,.]+)', r',', word)
            words = word.split(',')
            for word in words:
                if (len(word) > 0) and (
                        word.isspace()
                        == False) and word.lower() not in self.stop_words_dict:
                    if (word[0] == '#'):
                        word = word[1:]
                        hashtags = word.split('#')
                        for h in hashtags:
                            h = re.sub('[^A-z_0-9]', '', h)
                            if h != '':
                                self.parse_hashtags(h)
                    elif word[0] == '@':
                        word = word[1:]
                        tags = word.split('@')
                        for t in tags:
                            t = re.sub('[^A-z_0-9]', '', t)
                            if t != '':
                                self.parse_tags(t)
                    elif word[0] == '"' or word[0] == "'" or word[
                            0] == '‘' or word[0] == '’':
                        iterations = self.parse_quote(word, i, text_splitted)
                        i += iterations
                        continue
                    else:
                        word = re.sub('[^A-Za-z$%0-9]', '', word)
                        if word != '':
                            if self.toStem:
                                self.text_tokens.append(
                                    self.stemmer.stem_term((word)))
                            else:
                                self.text_tokens.append(word)
            i += 1
        return self.text_tokens
        ##print(self.text_tokens)

    def parse_tags(self, word):
        temp = re.sub('[^A-Za-z$0-9]', '', word)
        if temp != '':
            t_word = '@' + str(word.lower())
            self.text_tokens.append(t_word)

    def parse_quote(self, word, i, text_splitted):

        start_iterations = i
        word = str(word)
        if word[len(word) - 1] == '"' or word[len(word) - 1] == "'" or word[
                len(word) - 1] == '‘' or word[len(word) - 1] == '’':
            self.text_tokens.append(
                word.upper().strip('"').strip('"').strip('‘'))
        else:
            quote = word
            while True:
                if i < len(text_splitted) - 1:
                    next_word = re.sub('[^A-z%_@#.,!?$/0-9]', '',
                                       text_splitted[i + 1])
                    if len(next_word) == 0:
                        i += 1
                    elif next_word[-1] in ("'", '"', '‘', '’'):
                        quote += ' ' + next_word
                        self.text_tokens.append(quote.upper().strip('"').strip(
                            "'").strip('‘').strip('’'))
                        i += 1
                        break
                    else:
                        quote += ' ' + next_word
                        i += 1
                elif i == (len(text_splitted) - 1):
                    self.text_tokens.append(quote.upper().strip('"').strip(
                        "'").strip('‘').strip('’'))
                    break

        return i - start_iterations + 1

    def parse_hashtags(self, element):

        element = element.replace(' ', '')
        expanded = " ".join(
            [a for a in re.split('([A-Z][a-z]+)', element) if a])
        hashtag_tokens = expanded.split(' ')
        for w in hashtag_tokens:
            if w != '' and '_' not in w:
                if self.toStem:
                    self.text_tokens.append(self.stemmer.stem_term((w)))
                else:
                    self.text_tokens.append(w)
        word = re.sub('[^A-z$_0-9]', '', element)
        temp = re.sub('[^A-Za-z%$0-9]', '', word)
        if temp != '':
            self.text_tokens.append('#' + element)

    def parse_url(self, url):
        name = ''
        for character in url:
            if character == ' ':
                break
            if ('a' <= character <= 'z') or ('A' <= character <= 'Z') or (
                    '0' <= character <= '9') or (character == '.'):
                name += character
            elif (len(name) > 1) or ((len(name) == 1) and
                                     ('a' <= name <= 'z') or
                                     ('A' <= name <= 'Z') or
                                     ('0' <= name <= '9')):
                ##if name.isdigit():
                ##  name = self.parse_numbers(name)
                if name.lower() not in self.stop_words_dict and name != ' ':
                    if name not in self.term_dict:
                        self.term_dict[name] = 1
                    else:
                        self.term_dict[name] += 1
                name = ''
        if (len(name) > 1) or ((len(name) == 1) and ('a' <= name <= 'z') or
                               ('A' <= name <= 'Z') or ('0' <= name <= '9')):
            ##if name.isdigit():
            ##  name = self.parse_numbers(name)
            if name.lower() not in self.stop_words_dict and name != ' ':
                if name not in self.term_dict:
                    self.term_dict[name] = 1
                else:
                    self.term_dict[name] += 1

    def parse_numbers(self, item, next_i=''):

        r = ['', 'K', 'M', 'B']

        if bool(re.search(r'^-?[0-9]+\.[0-9]+$', item)):
            return item
        elif bool(re.search(r'^-?[0-9]+\/[0-9]+$',
                            next_i)) and float(item) <= 999:
            return item + ' ' + next_i
        elif bool(re.search(r'^-?[0-9]+\/[0-9]+$', item)):
            return item
        elif (next_i == "Thousand"
              or next_i == "thousand") and float(item) <= 9999:
            return item + "K"
        elif (next_i == "M" or next_i == "m" or next_i == "Million"
              or next_i == "million") and float(item) <= 9999:
            return item + "M"
        elif (next_i == "B" or next_i == "b" or next_i == "Billion"
              or next_i == "billion") and float(item) <= 9999:
            return item + "B"

        num = float(item)
        magnitude = 0
        while abs(num) >= 1000:
            magnitude += 1
            num /= 1000.0
            if magnitude >= 3:
                break
        return str("%.3f" % num).rstrip("0").rstrip(".") + '' + str(
            r[magnitude])

    def parse_doc(self, doc_as_list):
        """commi
        This function takes a tweet document as list and break it into different fields
        :param doc_as_list: list re-preseting the tweet.
        :return: Document object with corresponding fields.
        """

        tweet_id = doc_as_list[0]
        tweet_date = doc_as_list[1]
        full_text = doc_as_list[2]
        url = doc_as_list[3]
        retweet_text = doc_as_list[4]
        retweet_url = doc_as_list[5]
        quote_text = doc_as_list[6]
        quote_url = doc_as_list[7]
        self.term_dict = {}
        self.text_tokens = []
        self.parse_sentence(full_text)

        doc_length = len(self.text_tokens)  # after text operations.

        for term in self.text_tokens:
            if term not in self.term_dict:
                self.term_dict[term] = 1
            else:
                self.term_dict[term] += 1

        num_of_uniqe_terms = len(self.term_dict)

        max_tf = 0
        for item in self.term_dict.values():
            if item > max_tf:
                max_tf = item

        if (url is not None) and (url != '{}'):
            self.parse_url(url)

        if (quote_text is not None) and (quote_text != '{}'):
            self.parse_url(quote_text)

        str_retweet_url = str(retweet_url)
        url_retweet_url_index = str_retweet_url.find('https')
        if url_retweet_url_index != -1:
            url_retweet_url = str_retweet_url[url_retweet_url_index:]
            if (url_retweet_url is not None) and (url_retweet_url != '{}'):
                self.parse_url(url_retweet_url)

        document = Document(tweet_id, tweet_date, full_text, url, retweet_text,
                            retweet_url, quote_text, quote_url, self.term_dict,
                            doc_length, max_tf, num_of_uniqe_terms,
                            self.text_tokens)

        return document
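For reference, here is a standalone sketch of the K/M/B shortening applied at the end of parse_numbers above. The helper name is hypothetical and not part of the original class.

# Hypothetical helper mirroring the magnitude loop at the end of parse_numbers.
def shorten_number(value: float) -> str:
    suffixes = ['', 'K', 'M', 'B']
    magnitude = 0
    while abs(value) >= 1000 and magnitude < 3:
        magnitude += 1
        value /= 1000.0
    return ("%.3f" % value).rstrip("0").rstrip(".") + suffixes[magnitude]

print(shorten_number(123))        # 123
print(shorten_number(1234567))    # 1.235M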
Example #4
class Parse:
    """
    Parsing, syntax analysis, or syntactic analysis is the process of analyzing a string of symbols, either in natural language,
    computer languages or data structures, conforming to the rules of a formal grammar.
    The term parsing comes from Latin pars (orationis), meaning part (of speech).
    """
    def __init__(self, config=None):
        self.tmp_for_entites = {}
        self.stop_words = stopwords.words('english') + [
            '?', '!', ',', '+', '-', '*', '/', '"', '.', '<', '>', '=', ':',
            '', '{', '{}', '}', '[', ']', '[]', 'are', 'and', 'an', 'at', 'am',
            'a', 'even', 'every', 'everyone', 'rt', 'RT'
        ]
        self.global_dict = {}  #value=number of docs
        self.post_dict = {
        }  # key="word",value=[parquet name,index in parquet,tweet id,frequency in tweet,location in tweet,tf]
        self.entities = {}
        self.path_stop_words = [
            'RT', "rt", 'tweet', 'www', 'http', 'https', 'WWW'
        ]
        self.corona_list = [
            "cov", 'corona', 'coronavirus', 'covid', 'covid19', 'covid 19',
            'corona virus', 'virus corona', 'corona_virus', 'virus_corona',
            "virus"
        ]
        self.config = config
        self.trump = [
            "donald", "donald trump", "trump donald", "president",
            "trump_donald", "donald_trump", "trump-donald", "donald-trump"
        ]
        self.stemmer = None
        if self.config.toStem:
            self.stemmer = Stemmer()

    def parse_sentence(self, sentence):
        if sentence is None:
            return
        return self.tokenized_parse(sentence)

    def parse_doc(self, doc_as_list):
        """
        This function takes a tweet document as a list and breaks it into different fields.
        :param doc_as_list: list representing the tweet.
        :return: Document object with corresponding fields.
        """
        local_dict = {
        }  # key="word",value=[parquet name,index in parquet,tweet id,frequency in tweet,location in tweet]
        tweet_id = doc_as_list[0]
        tweet_date = doc_as_list[1]
        full_text = doc_as_list[2]
        url = doc_as_list[3]
        retweet_text = doc_as_list[4]
        retweet_url = doc_as_list[5]
        quote_text = doc_as_list[6]
        quote_url = doc_as_list[7]
        #if str(full_text).startswith("RT"): #if the tweet is RT and not hold more text (just share) pass
        #    return False
        term_dict = {}
        url = self.parse_url(url)
        tokenized_text = self.tokenized_parse(full_text) + url

        doc_length = len(tokenized_text)  # after text operations.
        unique_words = set()
        for i in range(doc_length):
            if len(tokenized_text[i]) <= 1:
                continue
            unique_words.add(tokenized_text[i])
            term_dict = self.update_doc_dict(term_dict,
                                             tokenized_text[i].lower())

        document = Document(tweet_id, tweet_date, full_text, url, retweet_text,
                            retweet_url, quote_text, quote_url, term_dict,
                            doc_length)
        return document

    def update_entity_global_dict(self):
        tmp = sorted(self.entities.items(), key=lambda x: x[1], reverse=True)
        entity = []
        for i in range(len(tmp)):  # iterate by index; tmp is sorted by count, descending
            if tmp[i][1] < 2:
                entity = tmp[:i]  # keep only the entities that appear at least twice
                break
        for word in entity:
            if word[0] not in self.global_dict:
                self.global_dict[word[0]] = word[1]
            else:
                self.global_dict[word[0]] += word[1]
            self.entities.pop(word[0])

    def update_entity_dict(self, term):
        if term in self.tmp_for_entites.keys():
            self.tmp_for_entites[term] += 1
        else:
            self.tmp_for_entites[term] = 1

    def extand_contractions(self, word):
        '''
        Expand contractions and common acronyms used on Twitter.
        :param word:
        :return:
        '''
        contractions = {
            "ain't": "am not / are not",
            "aren't": "are not / am not",
            "can't": "cannot",
            "can't've": "cannot have",
            "'cause": "because",
            "could've": "could have",
            "couldn't": "could not",
            "couldn't've": "could not have",
            "didn't": "did not",
            "doesn't": "does not",
            "don't": "do not",
            "hadn't": "had not",
            "hadn't've": "had not have",
            "hasn't": "has not",
            "haven't": "have not",
            "he'd": "he had / he would",
            "he'd've": "he would have",
            "he'll": "he shall / he will",
            "he'll've": "he shall have / he will have",
            "he's": "he has / he is",
            "how'd": "how did",
            "how'd'y": "how do you",
            "how'll": "how will",
            "how's": "how has / how is",
            "i'd": "I had / I would",
            "i'd've": "I would have",
            "i'll": "I shall / I will",
            "i'll've": "I shall have / I will have",
            "i'm": "I am",
            "i've": "I have",
            "isn't": "is not",
            "it'd": "it had / it would",
            "it'd've": "it would have",
            "it'll": "it shall / it will",
            "it'll've": "it shall have / it will have",
            "it's": "it has / it is",
            "let's": "let us",
            "ma'am": "madam",
            "mayn't": "may not",
            "might've": "might have",
            "mightn't": "might not",
            "mightn't've": "might not have",
            "must've": "must have",
            "mustn't": "must not",
            "mustn't've": "must not have",
            "needn't": "need not",
            "needn't've": "need not have",
            "o'clock": "of the clock",
            "oughtn't": "ought not",
            "oughtn't've": "ought not have",
            "shan't": "shall not",
            "sha'n't": "shall not",
            "shan't've": "shall not have",
            "she'd": "she had / she would",
            "she'd've": "she would have",
            "she'll": "she shall / she will",
            "she'll've": "she shall have / she will have",
            "she's": "she has / she is",
            "should've": "should have",
            "shouldn't": "should not",
            "shouldn't've": "should not have",
            "so've": "so have",
            "so's": "so as / so is",
            "that'd": "that would / that had",
            "that'd've": "that would have",
            "that's": "that has / that is",
            "there'd": "there had / there would",
            "there'd've": "there would have",
            "there's": "there has / there is",
            "they'd": "they had / they would",
            "they'd've": "they would have",
            "they'll": "they shall / they will",
            "they'll've": "they shall have / they will have",
            "they're": "they are",
            "they've": "they have",
            "to've": "to have",
            "wasn't": "was not",
            "we'd": "we had / we would",
            "we'd've": "we would have",
            "we'll": "we will",
            "we'll've": "we will have",
            "we're": "we are",
            "we've": "we have",
            "weren't": "were not",
            "what'll": "what shall / what will",
            "what'll've": "what shall have / what will have",
            "what're": "what are",
            "what's": "what has / what is",
            "what've": "what have",
            "when's": "when has / when is",
            "when've": "when have",
            "where'd": "where did",
            "where's": "where has / where is",
            "where've": "where have",
            "who'll": "who shall / who will",
            "who'll've": "who shall have / who will have",
            "who's": "who has / who is",
            "who've": "who have",
            "why's": "why has / why is",
            "why've": "why have",
            "will've": "will have",
            "won't": "will not",
            "won't've": "will not have",
            "would've": "would have",
            "wouldn't": "would not",
            "wouldn't've": "would not have",
            "y'all": "you all",
            "y'all'd": "you all would",
            "y'all'd've": "you all would have",
            "y'all're": "you all are",
            "y'all've": "you all have",
            "you'd": "you had / you would",
            "you'd've": "you would have",
            "you'll": "you shall / you will",
            "you'll've": "you shall have / you will have",
            "you're": "you are",
            "you've": "you have",
            "AFK": "Away From Keyboard",
            "BBIAB": "Be Back In A Bit",
            "BBL": "Be Back Later",
            "BBS ": "Be Back Soon",
            "BEG": "Big Evil Grin",
            "BRB": "Be Right Back",
            "BTW": "By The Way",
            "EG": "Evil Grin",
            "FISH": "First In, Still Here",
            "IDK": "I Don't Know",
            "IMO": "In My Opinion",
            "IRL": "In Real Life",
            "KISS": "Keep It Simple,Stupid",
            "LMK": "Let Me Know",
            "LOL": "Laughing Out Loud",
            "NYOB": " None of Your Business",
            "OFC ": "Of Course",
            "OMG ": "Oh My God",
            "PANS": "Pretty Awesome New Stuff",
            "PHAT": "Pretty, Hot, And Tempting",
            "POS ": "Parents Over Shoulder",
            "ROFL": "Rolling On the Floor Laughing",
            "SMH ": "Shaking My Head",
            "TTYL": "Talk To You Later",
            "YOLO": "You Only Live Once",
            "WTH ": "What The Heck",
        }
        if (word in contractions):
            return contractions[word]
        return word

    def deEmojify(self, text):
        "remove the emojipy"
        emoji_pattern = re.compile(
            "["
            u"\U0001F600-\U0001F64F"  # emoticons
            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
            u"\U0001F680-\U0001F6FF"  # transport & map symbols
            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
            u"\U00002500-\U00002BEF"  # chinese char
            u"\U00002702-\U000027B0"
            u"\U00002702-\U000027B0"
            u"\U000024C2-\U0001F251"
            u"\U0001f926-\U0001f937"
            u"\U00010000-\U0010ffff"
            u"\u2640-\u2642"
            u"\u2600-\u2B55"
            u"\u200d"
            u"\u23cf"
            u"\u23e9"
            u"\u231a"
            u"\ufe0f"  # dingbats
            u"\u3030"
            "]+",
            flags=re.UNICODE)
        return emoji_pattern.sub(r'', text)

    def parse_url(self, url_string):
        """
        This function takes a url string from the document and breaks it into a list of words:
        https://www.instagram.com/p/CD7fAPWs3WM/?igshid=o9kf0ugp1l8x -> [https, www, instagram.com, p, CD7fAPWs3WM, igshid, o9kf0ugp1l8x]
        :param url_string: URL string from the tweet.
        :return: list of the words that make up the URL.
        """
        if str(url_string).__contains__('t.co') or str(
                url_string).__contains__('twitter') or len(url_string) < 3:
            return []
        tmp_word = ""
        word_list = [url_string]
        url = url_string.replace("//", "/")
        for i in range(len(url)):
            if (url[i] == "/" or url[i] == "-" or url[i] == "_"):
                word_list.append(tmp_word)
                tmp_word = ""
            elif i != len(url) - 1:
                tmp_word = tmp_word + url[i]
            else:

                word_list.append(tmp_word)
                if len(word_list) > 2:
                    word_list = word_list[2:]

        return word_list

    def truncate(self, number, digits) -> float:
        stepper = 10.0**digits
        return math.trunc(stepper * number) / stepper

    def fix_number(self, toc_text):
        """
        Convert numbers to a shortened form, e.g.
        3000 -> 3K
        3,000,000 -> 3M

        :param toc_text: the tokenized text
        :return:
        """

        for i in range(len(toc_text)):
            num = toc_text[i]
            num = num.replace(',', '')
            if (num.isnumeric()):
                flag = False
                for digit in range(len(num)):
                    if (num[digit].isdigit() == False and num[digit] != '.'):
                        flag = True
                if (flag):
                    continue
                try:
                    num = float(num)
                except:
                    continue
                flag1 = False
                if (1000 <= num < 1000000):
                    flag1 = True
                    num = num / 1000
                    num = str(self.truncate(num, 3)) + "K"

                elif (1000000 <= num < 1000000000):
                    flag1 = True
                    num = num / 1000000
                    num = str(self.truncate(num, 3)) + "M"
                elif (num > 1000000000):
                    flag1 = True
                    num = num / 1000000000
                    num = str(self.truncate(num, 3)) + "B"
                num = str(num)
                if (flag1 == False):
                    if (num[-1] == "0"):
                        num = num[0:-1]
                        if (num[-1] == "."):
                            num = num[0:-1]
                if (flag):
                    if (num[-2] == "0"):
                        num = num[0:-2] + num[-1:]
                        if (num[-1] == "."):
                            num = num[0:-2] + num[-1:]

                toc_text[i] = num

                if (i + 1 == len(toc_text)):
                    break
                else:
                    if (toc_text[i + 1] == "Thousand"
                            or toc_text[i + 1] == "thousand"):
                        toc_text[i] = str(toc_text[i]) + "K"
                        toc_text[i + 1] = ""
                    elif (toc_text[i + 1] == "Million"
                          or toc_text[i + 1] == "million"):
                        toc_text[i] = str(toc_text[i]) + "M"
                        toc_text[i + 1] = ""
                    elif (toc_text[i + 1] == "Billion"
                          or toc_text[i + 1] == "billion"):
                        toc_text[i] = str(toc_text[i]) + "B"
                        toc_text[i + 1] = ""
        return toc_text

    def update_doc_dict(self, term_dict, word):
        #try:
        if word not in term_dict:
            term_dict[word] = 1
        else:
            #except:
            term_dict[word] += 1
        return term_dict

    def update_global_dict(self, word):
        """
        Check whether the word is already in the global dict; add it if not, otherwise increment its count.
        :param word:
        :return:
        """
        if word not in self.global_dict:
            self.global_dict[word] = 1
        else:
            self.global_dict[word] += 1

    def Hashtags_parse(self, toc_text):
        """
        This function takes the tokenized text and breaks every hashtag in it into a list of words.
        :param toc_text: tokenized text.
        :return: tokenized text including the spread words and the #tag.
        """

        copy_toc_text = []
        for term in toc_text:
            copy_toc_text.append(term)
        count = 0
        parseList = ''
        i = 0
        for term in toc_text:
            count += 1
            tag = term
            flag = True
            if (len(tag) <= 0 or tag[0] != '#'):
                continue
            parseList = tag[1:]
            parseList = str.replace(parseList, '_', '')
            #parseList = re.sub(r"([A-Z])", r" \1", parseList)
            #parseList=self.sub_by_upper(parseList)
            #secparseList = parseList.replace(' ', '')
            split_tag = self.sub_by_upper(parseList) + [
                '#' + parseList.lower()
            ]
            if ('' in split_tag):
                split_tag.remove('')
                count -= 1

            i = count + i
            for word in split_tag:
                copy_toc_text.insert(i, word)
                i += 1
                if (i - count == len(split_tag)):
                    copy_toc_text.remove(term)
            i = i - count
            # term_dict = self.update_doc_dict(term_dict, word)
            # if (flag):
            #     flag = False
            #     self.upper_lower_global_dict(word)
        return copy_toc_text

    def percent_parse(self, toc_text):
        """
        This function changes the representation of Number%, Number percent, Number percentage to Number%.
        :param toc_text: tokenized text.
        :return: tokenized text with percentages in the format Number%.
        """
        percent_op = ['percentage', 'PERCENTAGE', 'PERCENT', 'percent']  # no leading spaces so tokens can match
        for i in range(0, len(toc_text)):
            if (str.isnumeric(toc_text[i]) and i + 1 < len(toc_text)
                    and toc_text[i + 1] in percent_op):
                toc_text[i] = toc_text[i] + '%'
                toc_text[i + 1] = ""
                #term_dict = self.update_doc_dict(term_dict, toc_text[i] + '%')
                #self.upper_lower_global_dict(toc_text[i] + '%')
        return toc_text

    def currency_parse(self, term):
        """
        This function converts a currency code into its full name, e.g. USD -> United States Dollar.
        :param term: the term to look up.
        :return: the expanded currency name, or the original term if it is not a currency code.
        """
        t = term.upper()
        currency_dict = {
            'ALL': 'Albania Lek',
            'AFN': 'Afghanistan Afghani',
            'ARS': 'Argentina Peso',
            'AWG': 'Aruba Guilder',
            'AUD': 'Australia Dollar',
            'AZN': 'Azerbaijan New Manat',
            'BSD': 'Bahamas Dollar',
            'BBD': 'Barbados Dollar',
            'BDT': 'Bangladeshi taka',
            'BYR': 'Belarus Ruble',
            'BZD': 'Belize Dollar',
            'BMD': 'Bermuda Dollar',
            'BOB': 'Bolivia Boliviano',
            'BAM': 'Bosnia and Herzegovina Convertible Marka',
            'BWP': 'Botswana Pula',
            'BGN': 'Bulgaria Lev',
            'BRL': 'Brazil Real',
            'BND': 'Brunei Darussalam Dollar',
            'KHR': 'Cambodia Riel',
            'CAD': 'Canada Dollar',
            'KYD': 'Cayman Islands Dollar',
            'CLP': 'Chile Peso',
            'CNY': 'China Yuan Renminbi',
            'COP': 'Colombia Peso',
            'CRC': 'Costa Rica Colon',
            'HRK': 'Croatia Kuna',
            'CUP': 'Cuba Peso',
            'CZK': 'Czech Republic Koruna',
            'DKK': 'Denmark Krone',
            'DOP': 'Dominican Republic Peso',
            'XCD': 'East Caribbean Dollar',
            'EGP': 'Egypt Pound',
            'SVC': 'El Salvador Colon',
            'EEK': 'Estonia Kroon',
            'EUR': 'Euro Member Countries',
            'FKP': 'Falkland Islands (Malvinas) Pound',
            'FJD': 'Fiji Dollar',
            'GHC': 'Ghana Cedis',
            'GIP': 'Gibraltar Pound',
            'GTQ': 'Guatemala Quetzal',
            'GGP': 'Guernsey Pound',
            'GYD': 'Guyana Dollar',
            'HNL': 'Honduras Lempira',
            'HKD': 'Hong Kong Dollar',
            'HUF': 'Hungary Forint',
            'ISK': 'Iceland Krona',
            'INR': 'India Rupee',
            'IDR': 'Indonesia Rupiah',
            'IRR': 'Iran Rial',
            'IMP': 'Isle of Man Pound',
            'ILS': 'Israel Shekel',
            'JMD': 'Jamaica Dollar',
            'JPY': 'Japan Yen',
            'JEP': 'Jersey Pound',
            'KZT': 'Kazakhstan Tenge',
            'KPW': 'Korea (North) Won',
            'KRW': 'Korea (South) Won',
            'KGS': 'Kyrgyzstan Som',
            'LAK': 'Laos Kip',
            'LVL': 'Latvia Lat',
            'LBP': 'Lebanon Pound',
            'LRD': 'Liberia Dollar',
            'LTL': 'Lithuania Litas',
            'MKD': 'Macedonia Denar',
            'MYR': 'Malaysia Ringgit',
            'MUR': 'Mauritius Rupee',
            'MXN': 'Mexico Peso',
            'MNT': 'Mongolia Tughrik',
            'MZN': 'Mozambique Metical',
            'NAD': 'Namibia Dollar',
            'NPR': 'Nepal Rupee',
            'ANG': 'Netherlands Antilles Guilder',
            'NZD': 'New Zealand Dollar',
            'NIO': 'Nicaragua Cordoba',
            'NGN': 'Nigeria Naira',
            'NOK': 'Norway Krone',
            'OMR': 'Oman Rial',
            'PKR': 'Pakistan Rupee',
            'PAB': 'Panama Balboa',
            'PYG': 'Paraguay Guarani',
            'PEN': 'Peru Nuevo Sol',
            'PHP': 'Philippines Peso',
            'PLN': 'Poland Zloty',
            'QAR': 'Qatar Riyal',
            'RON': 'Romania New Leu',
            'RUB': 'Russia Ruble',
            'SHP': 'Saint Helena Pound',
            'SAR': 'Saudi Arabia Riyal',
            'RSD': 'Serbia Dinar',
            'SCR': 'Seychelles Rupee',
            'SGD': 'Singapore Dollar',
            'SBD': 'Solomon Islands Dollar',
            'SOS': 'Somalia Shilling',
            'ZAR': 'South Africa Rand',
            'LKR': 'Sri Lanka Rupee',
            'SEK': 'Sweden Krona',
            'CHF': 'Switzerland Franc',
            'SRD': 'Suriname Dollar',
            'SYP': 'Syria Pound',
            'TWD': 'Taiwan New Dollar',
            'THB': 'Thailand Baht',
            'TTD': 'Trinidad and Tobago Dollar',
            'TRY': 'Turkey Lira',
            'TRL': 'Turkey Lira',
            'TVD': 'Tuvalu Dollar',
            'UAH': 'Ukraine Hryvna',
            'GBP': 'United Kingdom Pound',
            'USD': 'United States Dollar',
            'UYU': 'Uruguay Peso',
            'UZS': 'Uzbekistan Som',
            'VEF': 'Venezuela Bolivar',
            'VND': 'Viet Nam Dong',
            'YER': 'Yemen Rial',
            'ZWD': 'Zimbabwe Dollar'
        }
        if t in currency_dict:
            return currency_dict[t]
        return term

    def update_post_dict(self, tweet_id, local_dict, term_dict, tweet_date):
        """
        Update the posting dict.
        :param tweet_id: tweet ID (int)
        :param local_dict: dict holding term locations
        :param term_dict: dict holding term frequencies
        :param tweet_date:
        :return:
        """
        max_tf = max(term_dict.values())
        for term in term_dict:
            tf = term_dict[term] / max_tf  # reuse the maximum computed above
            if term not in self.post_dict:
                self.post_dict[term] = [
                    [
                        tweet_id, term_dict[term], tf, local_dict[term][1],
                        len(term_dict), max_tf, tweet_date
                    ]
                ]  # [tweet ID, term freq, tf, term location, num unique terms in tweet, max_tf, date]
            else:
                self.post_dict[term].append([
                    tweet_id, term_dict[term], tf, local_dict[term][1],
                    len(term_dict), max_tf, tweet_date
                ])

    def get_global_dict(self):
        dict = self.global_dict
        self.global_dict = {}
        return dict

    def get_posting_dict(self):
        dict = self.post_dict
        self.post_dict = {}
        return dict

    def sub_by_upper(self, text):
        """
        Cut a concatenated word into a list of lowercase words, splitting at every uppercase letter.
        :param text: long word
        :return: list of words, each starting where an uppercase letter was found
        """
        parseList = []
        tmp = []
        word = ""
        for i in range(len(text)):
            if text[i].isupper():
                tmp.append(i)
        for i in range(len(tmp) - 1):
            word = text[tmp[i]:tmp[i + 1]]
            parseList.append(word.lower())
        if (len(tmp) > 0):
            text = text[tmp[-1]:]
            parseList.append(text.lower())
        return parseList

    def update_entity_dict(self, term):
        """
        Update the occurrence count of the entity.
        :param term:
        :return:
        """
        if term in self.tmp_for_entites.keys():
            self.tmp_for_entites[term] += 1
        else:
            self.tmp_for_entites[term] = 1

    def find_entities(self, tokenized_text):
        """
        Recognize entities: sequences of two or more consecutive words that start with an uppercase letter.
        :param tokenized_text: list after tokenization
        :return:
        """

        UPPER_letter = False
        tmp_entity = ""
        for idx, word in enumerate(tokenized_text):
            if len(word) < 1:
                continue
            elif len(tmp_entity.split()) >= 2:
                self.update_entity_dict(tmp_entity)
                tmp_entity = ""
                UPPER_letter = False
            elif word[0].isupper() and UPPER_letter == True:
                tmp_entity += " " + word
                if (idx == len(tokenized_text) - 1):
                    self.update_entity_dict(tmp_entity)
            elif word[0].isupper() and UPPER_letter == False:
                UPPER_letter = True
                tmp_entity += word

            else:
                tmp_entity = ""

    def tokenized_parse(self, full_text):
        """

        :param full_text: the original text
        :return: list of term without stop words+@term+ #terms without emojify
        """
        full_text = self.deEmojify(full_text)
        tokenized_text = full_text.split(' ')
        tokenized_text_copy = []
        for term in tokenized_text:

            if term.lower() in self.trump:
                tokenized_text_copy.append("trump")
                tokenized_text[tokenized_text.index(term)] = "trump"
                continue
            tokenized_text_copy.append(term)

        for i in tokenized_text:
            if i.lower(
            ) in self.stop_words or i in self.path_stop_words or i.startswith(
                    "\n") or i.startswith(
                        "https") or len(i) < 2:  #remove from original
                tokenized_text_copy.remove(i)
                continue
            idx = tokenized_text_copy.index(i)
            if '.' in i:
                tokenized_text_copy[idx] = tokenized_text_copy[idx].replace(
                    ".", '')
            if ',' in i:
                tokenized_text_copy[idx] = tokenized_text_copy[idx].replace(
                    ",", '')
            tokenized_text_copy[idx] = self.extand_contractions(
                tokenized_text_copy[idx].lower())
            tokenized_text_copy[idx] = self.currency_parse(
                tokenized_text_copy[idx])

        tokenized_text = tokenized_text_copy
        # save #tag
        tokenized_text = self.Hashtags_parse(tokenized_text)
        # save numbers end with M K B
        tokenized_text = self.fix_number(tokenized_text)
        # save num%
        tokenized_text = self.percent_parse(tokenized_text)
        # save entity
        self.find_entities(tokenized_text)

        try:
            if self.stemmer != None:
                for i in range(len(tokenized_text)):
                    tokenized_text[i] = self.stemmer.stem_term(
                        tokenized_text[i])

        except:
            pass

        return tokenized_text

    def get_entity_dict(self):
        dict = self.entities
        self.entities = {}
        return dict
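Below is a small, self-contained sketch of the CamelCase hashtag splitting that sub_by_upper and Hashtags_parse implement above; the helper name is hypothetical.

# Hypothetical helper: split a CamelCase hashtag into lowercase words plus the #tag itself.
import re

def split_hashtag(tag: str):
    body = tag.lstrip('#').replace('_', '')
    words = [w.lower() for w in re.split(r'([A-Z][a-z]+)', body) if w]
    return words + ['#' + body.lower()]

print(split_hashtag("#StayAtHome"))   # ['stay', 'at', 'home', '#stayathome']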

class Parse:
    def __init__(self):
        self.stop_words = stopwords.words('english')
        self.dictionary_term_index = {}
        self.array_names_and_entities = {}
        self.porter_stemmer = Stemmer()

    def parse_sentence(self, text, stemmer=False):
        """
        This function tokenizes the text, removes stop words and applies lower case to every word within the text.
        :param text:
        :return:
        """

        list_percent = ["percent", "Percent", "Percentage", "percentage"]
        self.array_names_and_entities = {}
        self.dictionary_index = {}
        text = text.replace("\n", ". ")
        text = self.ignore_emojis(text)
        array_text_space = text.split(" ")
        array_text_space = self.separate_words_with_dots(array_text_space)
        string_ans = ""
        array_size = range(len(array_text_space))
        string_ans_index = 0
        entities_url = []  # help us to replace the url to "" because in get_entities it returns parts of the url
        for word, idx in zip(array_text_space, array_size):
            ans = ""
            if word == '' or word == ' ': continue
            check_digit = self.isdigit(word)
            if len(word) < 2 and check_digit is False: continue
            if len(word) < 2 or self.is_ascii(word) is False:
                if check_digit is False:
                    word = self.remove_panctuation(word)
                    if self.is_ascii(word) is False or word == '' or word == " " or len(
                            word) < 2 or word.lower() not in self.stop_words:
                        continue
            if ans == "" and self.is_url(word):
                entities_url.append(word)
                if "t.co" in word: continue
                ans = self.parse_url(word)
                if ans == "":
                    entities_url.remove(word)
                    continue
            else:
                if ans == "" and len(word) < 2 and word[0] != '#' and self.is_ascii(word) and not self.isfloat(word):
                    word = self.remove_panctuation(word)
            if ans == "" and word[0] == '#':
                temp_word = self.remove_panctuation(word)
                if temp_word == "" or temp_word == "#":
                    continue
                ans = self.parse_hashtag(temp_word)
            elif ans == "" and word[0] == '@':
                ans = self.remove_panctuation(word)
            elif ans == "" and word in list_percent:
                if idx > 0 and self.isfloat(array_text_space[idx - 1]):
                    ans = self.parse_percentage(array_text_space[idx - 1] + " " + word)
                    string_ans = string_ans[:len(string_ans) - 1 - len(ans)] + string_ans[
                                                                               len(string_ans) + len(word):] + " "
                else:
                    ans = word
            elif ans == "" and (word.lstrip('-').isdigit() or self.isfloat(word.lstrip('-')) or self.isFraction(
                    word.lstrip('-')) or word.replace('~', '').isdigit()):
                ans = self.convert_str_to_number(array_text_space, idx)
            if ans == "":
                pre_ans = self.remove_panctuation(word)
                if len(pre_ans) < 2: continue
                array_ans = pre_ans.split()
                continued_array = []
                for word_array_ans in array_ans:

                    splitted_word, is_number = self.split_word_to_numbers_strings(word_array_ans)
                    if splitted_word == '': continue
                    arr = splitted_word.split(" ")
                    for spl in arr:
                        if spl.lower() in self.stop_words or len(word_array_ans) < 2: continue
                        if is_number or self.check_two_letters(spl):
                            spl = self.remove_panctuation_special(spl)
                            string_ans += self.add_to_dictionary(spl, string_ans_index)
                            string_ans_index += len(word) + 1
                            continue
                        else:
                            string_ans += self.add_to_dictionary(word_array_ans.lower(), string_ans_index)
                            string_ans_index += len(word) + 1
            else:
                string_ans += self.add_to_dictionary(ans, string_ans_index)
                string_ans_index += len(word) + 1

        self.get_name_and_entities(entities_url, array_text_space)
        array_parsed = string_ans.split()
        ans = []
        for word in array_parsed:
            if word[0] != '#' and word[0] != '@':
                if self.check_two_letters(word):
                    us_word = self.remove_panctuation_special(word)
                    ans.append(us_word)
                    continue
                ans.append(word)
        return ans, self.array_names_and_entities

    def separate_words_with_dots(self, array_text):
        new_text = ""
        length = range(len(array_text))
        for i in length:
            word = array_text[i]
            if '.' not in word:
                if word == '': continue
                new_text += word + " "
                continue
            if "http" in word or "www" in word or "t.co" in word or self.isfloat(word):
                check_regular_point = word.split('.', 1)
                if check_regular_point[0] != '' and check_regular_point[1] != '' and self.is_url(
                        check_regular_point[1]):
                    new_text += check_regular_point[0] + '. ' + check_regular_point[1]
                    continue
                if check_regular_point[1] == '':
                    new_text += check_regular_point[0] + " "
                    continue
                new_text += word + " "
                continue
            if self.check_two_letters(word):
                us_word = self.remove_panctuation_special(word)
                new_text += us_word + " "
                continue
            separate = str(word).split('.')
            new_text += separate[0] + ". " + separate[1] + " "
        return new_text.lstrip().split(" ")

    def is_url(self, text):
        '''
        check if string is a url path
        :param text: url
        :return: boolean
        '''
        regex = re.compile(
            r'^(?:http|ftp)s?://|(?:www)?.'  # http:// or https://
            r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
            r'localhost|'  # localhost...
            r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
            r'(?::\d+)?'  # optional port
            r'(?:/?|[/?]\S+)$', re.IGNORECASE)

        return re.match(regex, text) is not None

    def add_to_dictionary(self, text, index):
        array_of_words = text.split(" ")
        ans = ""
        for word in array_of_words:
            ans += word + " "
            self.dictionary_index[word] = index
        if ans == "": return ""
        return ans

    def parse_hashtag(self, phrase):
        """"
        parser hash tag and lower the letters
        return array of string
        #stayAtHome -> ['#stayathome',stay,at,home]
        """
        original_phrase = phrase
        pattern = re.compile(r"[A-Z][a-z]+|\d+|[A-Z]+(?![a-z])")
        if phrase[1].islower() and '_' not in original_phrase:
            phrase = phrase[:1] + phrase[1].upper() + phrase[2:]
        temp = pattern.findall(phrase)
        all_words = phrase[1:].split("_")
        for word in all_words:
            if word != phrase[1:] and word.lower() and word not in temp: temp.append(word)
        temp = [str_to_lower.lower() for str_to_lower in temp]
        # temp.insert(0, original_phrase[0:len(original_phrase)].lower().replace('_', ''))
        i = 0
        len_temp = len(temp)
        while i < len_temp:
            if temp[i] in self.stop_words or len(temp[i]) < 2:
                temp[i] = ''
            i += 1
        return " ".join(temp).lstrip().rstrip()

    def parse_url(self, string):
        """
        parsing url path
        return an array of the components
        """
        if string is not None:
            ans = string.split("/")

            ans_len = len(ans)
            remove_www = ""
            if ans_len > 0:
                for term in ans:
                    remove_www += term.replace("www.", "") + " "
                ans[0] = ans[0].replace(ans[0], remove_www)
                string_without_stopword = ""
                length = range(len(ans))
                ans_string = ans[0].split(" ")
                for word, idx in zip(ans_string, length):
                    if word == '' or word == ' ': continue
                    if len(word) < 2 or (len(word) > 0 and word[0] == '#'): continue
                    if word not in self.stop_words or word.isnumeric():
                        if not self.is_url(word):
                            word = self.remove_panctuation(word)
                        string_without_stopword += word + " "
                return string_without_stopword.lstrip()
            else:
                return ""

    def isdigit(self, word):
        if "0" <= word <= "9":
            return True
        return False

    def isfloat(self, value):
        """
            check if value is a float number
        :return: boolean
        """
        try:
            float(value)
            return True
        except ValueError:
            return False

    def isFraction(self, token):
        """
        check if value is a fraction number
        :return: boolean
        """
        if '/' not in token:
            return False
        values = token.split('/')
        return all(i.isdigit() for i in values)

    def convert_str_to_number_kmb(self, word):
        """
        Shorten a number to K/M/B notation, e.g. 1000 -> 1K, 1013456 -> 1.013M.
        :return: string
        """
        tmb = ''
        if word >= 1000000000 or word <= -1000000000:
            word = float(word / 1000000000)
            tmb = 'B'
        elif word >= 1000000 or word <= -1000000:
            word = float(word / 1000000)
            tmb = 'M'
        elif word >= 1000 or word <= -1000:
            word = float(word / 1000)
            tmb = 'K'
        ans = '{:0.3f}'.format(word)
        return '{0:g}'.format(float(ans)) + tmb

    def check_two_letters(self, word):
        if 0 < len(word) < 7 and (word.upper()[0] == 'U' and 'S' in word.upper()):
            start = word.upper().find('U') + 1
            end = word.upper().find('S', start)
            dot = word[start:end]
            if dot == '.':
                return True

    def split_word_to_numbers_strings(self, word):
        try:
            if self.check_two_letters(word):
                us_word = self.remove_panctuation_special(word)
                return us_word, False
            res = re.findall(r'[A-Za-z]+|\d+', word)
            if len(res)==0: return '', False
            if len(word) > 0 and self.isfloat(res[0]):
                if len(res) > 1 and (
                        "thousand" in res[1].lower() or "million" in res[1].lower() or "billion" in res[1].lower()
                        or "b" in res[1].lower() or "m" in res[1].lower() or "k" in res[1].lower()):
                    if "thousand" in word.lower(): return word.replace(res[1], "K"), True
                    if 'k' in word.lower(): return word.replace(res[1], "K"),True
                    if "million" in word.lower(): return word.replace(res[1], "M"),True
                    if 'm' in word.lower(): return word.replace(res[1], "M"),True
                    if "billion" in word.lower(): return word.replace(res[1], "B"),True
                    if 'b' in word.lower(): return word.replace(res[1], "B"),True
                else:
                    return (" ".join(res).lstrip().rstrip(),True)
            else:
                is_number =False
                return (" ".join(res).lstrip().rstrip(),False)
        except:
            return word, False
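
    # A sketch of split_word_to_numbers_strings, assuming `parser` is an instance of this class:
    #
    #     >>> parser.split_word_to_numbers_strings("10k")
    #     ('10K', True)
    #     >>> parser.split_word_to_numbers_strings("5million")
    #     ('5M', True)
    #     >>> parser.split_word_to_numbers_strings("covid19")
    #     ('covid 19', False)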

    def convert_str_to_number(self, text_demo, idx):
        """
        check every type of number and return it as a string. etc: 1K,1M,1B,-900,23/5,2020,2K
        :return: boolean
        """
        help_minus = ''
        text_return = []
        my_word = text_demo[idx]
        text_demo_length = len(text_demo)
        my_word = my_word.replace(",", "")
        if re.search('-', my_word):
            help_minus = '-'
            my_word = my_word.replace("-", "")
        if not self.isfloat(my_word): my_word = self.remove_panctuation(my_word)
        if self.isFraction(my_word):
            if idx + 1 == text_demo_length:
                return ''.join(help_minus + my_word)
            text_return = ''.join(help_minus + my_word)
            token_next = text_demo[idx + 1].lower()
            if token_next == "billion" or token_next == "billions":
                text_return += 'B'
                text_demo[idx + 1] = ""
            if token_next == "million" or token_next == "millions":
                text_return += 'M'
                text_demo[idx + 1] = ""
            if token_next == "thousand" or token_next == "thousands":
                text_return += 'K'
                text_demo[idx + 1] = ""
            return help_minus + ''.join(text_return)
        if my_word != '' and not math.isnan(float(my_word)):
            number = float(my_word)
            number_numerize = self.convert_str_to_number_kmb(number)
            if idx + 1 < len(text_demo):
                token_next = text_demo[idx + 1].lower()
                number_to_input = str(number_numerize)
                if token_next == "billion" or token_next == "billions":
                    if 'K' in number_numerize or 'M' in number_numerize:
                        number_to_input = (number_to_input.translate({ord('K'): None}))
                        number_to_input = (number_to_input.translate({ord('M'): None}))
                        text_return.append(my_word)
                    else:
                        text_return.append(str(number_numerize + 'B'))
                    text_demo[idx + 1] = ""

                elif token_next == "million" or token_next == "millions":
                    if 'K' in number_numerize:
                        number_to_input = (number_to_input.translate({ord('K'): None}))
                        text_return.append(number_to_input + 'B')
                    else:
                        number_to_input = str(number_numerize)
                        text_return.append(number_to_input + 'M')
                    text_demo[idx + 1] = ""
                elif token_next == "thousand" or token_next == "thousands":
                    if 'K' in number_numerize:
                        number_to_input = (number_to_input.translate({ord('K'): None}))
                        text_return.append(number_to_input + 'M')
                    elif 'M' in number_numerize:
                        number_to_input = (number_to_input.translate({ord('M'): None}))
                        text_return.append(number_to_input + 'B')
                    else:
                        text_return.append(number_to_input + 'K')
                    text_demo[idx + 1] = ""
                else:
                    text_return.append(number_numerize)
            else:
                text_return.append(number_numerize)
            if 1900 < number < 2100 and help_minus == '':
                if '~' in text_demo[idx]:
                    text_return.append(my_word)
                else:
                    len_number = len(text_demo[idx])
                    if text_demo[idx][len_number - 1] == '.':
                        res = my_word.replace('.','')
                        text_return.append(res)
                    else:
                        text_return.append(text_demo[idx])
        return help_minus + ' '.join(text_return)
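
    # A sketch of convert_str_to_number on a token list, assuming `parser` is an instance
    # of this class (the consumed look-ahead token is blanked out in the input list):
    #
    #     >>> parser.convert_str_to_number(['3', 'million', 'masks'], 0)
    #     '3M'
    #     >>> parser.convert_str_to_number(['-75,000', 'people'], 0)
    #     '-75K'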

    def ignore_emojis(self, text):
        emoji_pattern = re.compile("["
                                   u"\U0001F600-\U0001F64F"  # emoticons
                                   u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                                   u"\U0001F680-\U0001F6FF"  # transport & map symbols
                                   u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                                   u"\U00002500-\U00002BEF"  # chinese char
                                   u"\U00002702-\U000027B0"
                                   u"\U00002702-\U000027B0"
                                   u"\U000024C2-\U0001F251"
                                   u"\U0001f926-\U0001f937"
                                   u"\U00010000-\U0010ffff"
                                   u"\u2640-\u2642"
                                   u"\u2600-\u2B55"
                                   u"\u200d"
                                   u"\u23cf"
                                   u"\u23e9"
                                   u"\u231a"
                                   u"\ufe0f"  # dingbats
                                   u"\u3030"
                                   "]+", flags=re.UNICODE)
        ans = emoji_pattern.sub(r'', text)
        return ans

    def is_ascii(self, s):
        # allow plain ASCII plus a few specific characters that show up in tweets
        allowed_extras = {'…', '’', '³', '¹', '⁹'}
        ans = all(ord(c) < 128 or c in allowed_extras for c in s)
        return ans

    def parse_percentage(self, string):
        """
        change a "<number> percent" phrase to percent notation, e.g. "100 percent" -> "100%"
        :param string: string whose first token is the number
        :return: the number with a '%' suffix
        """
        return re.split(r'\s+', string)[0] + '%'

    def remove_panctuation_special(self, word):
        """
                remove pancuations from word U.S (like U.S., or U.S.'s)
                :param word
                :return: word without panctuation
                """

        if 'a' in word.lower():
            temp = word[:5]
            to_pancuate = word.replace(temp, '')
            # word = word.lower().replace("u.s", '')
            word = temp + self.remove_panctuation(to_pancuate)
            return word
        else:
            temp = word[:3]
            to_pancuate = word.replace(temp, '')
            # word = word.lower().replace("u.s", '')
            word = temp + self.remove_panctuation(to_pancuate.lower())
            return word

    def remove_panctuation(self, word):
        """
                remove pancuations from word (like . or , or : )
                :param word
                :return: word without panctuation
                """
        if self.check_two_letters(word):
            #word = self.remove_panctuation_us(word)
            return word
        if re.match(r'[^@]+@[^@]+\.[^@]+', word): return word
        if "#" == word or "##" == word: return ""
        if word[-2:] == "'s" or word[-2:] == "’s" or word[-2:] == "`s": word = word.replace(word[-2:], "")
        smiles = [":)", ":(", ":-]", ":-)", ";)", ";-)", ":-(", ";(", ";-(", ":-P", ":P", ":p", ":-p"]
        for smile in smiles:
            if smile in word: word = word.replace(smile, "")
        if word in smiles: return ''
        if "\n" in word: word = word.replace("\n", " ")
        if '#' in word and word[0] != '#': word = word.replace("#", "")
        if '_' in word and '#' not in word:
            word = word.replace("_", "")
        if '@' in word and word[0] != '@': word = word.replace("@", "")

        word = word.replace("-", " ")
        word = word.replace("'", "")
        word = re.sub(r'[€£€4️⃣“”‘⁦⁩‼⑥²⁸¹❶❷❽②⑦&$~’.,!…|?,…:;^"{}*=+()⁰\/[\[\]]', '', word)
        return word
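
    # A sketch of remove_panctuation, assuming `parser` is an instance of this class:
    #
    #     >>> parser.remove_panctuation("COVID-19!!")
    #     'COVID 19'
    #     >>> parser.remove_panctuation("Trump's")
    #     'Trump'
    #     >>> parser.remove_panctuation("U.S.A.")
    #     'U.S.A.'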

    def get_name_and_entities(self, entities_url, array_text_space):
        text = ""
        for word in array_text_space:
            if word == '' or word == ' ' or word[0] == '@' or word[0] == '#' or word == "RT": continue
            text += word + " "

        rx2 = re.compile(r'[A-Z][-a-zA-Z]+[1-9]*(?:\s+[A-Z][-a-zA-Z]+[1-9]*)*')
        matches = rx2.findall(text)
        tokinzed_entity_new = set()
        for match in matches:
            if len(str(match).split()) > 1:
                tokinzed_entity_new.add(str(match))
        if "COVID 19" in text: tokinzed_entity_new.add("COVID 19")
        if "Covid 19" in text: tokinzed_entity_new.add("Covid 19")

        for word in tokinzed_entity_new:
            if word.lower() not in self.stop_words:
                all_places = [m.start() for m in re.finditer(word, text)]
                self.array_names_and_entities[word] = all_places
        return tokinzed_entity_new

    def parse_doc(self, doc_as_list, stemmer=False):
        """
        This function takes a tweet document as list and breaks it into different fields
        :param doc_as_list: list representing the tweet.
        :return: Document object with corresponding fields.
        """
        tweet_id = doc_as_list[0]
        tweet_date = doc_as_list[1]
        full_text = doc_as_list[2]
        url = doc_as_list[3]
        indices = doc_as_list[4]
        retweet_text = doc_as_list[5]
        retweet_url = doc_as_list[6]
        retweet_indices = doc_as_list[7]
        quote_text = doc_as_list[8]
        quote_url = doc_as_list[9]
        quote_indices = doc_as_list[10]
        term_dict = {}
        entities_local_dict = {}
        array_url_parsed = []
        url = str(url)
        rt = False
        if "RT" in full_text:
            rt = True

        tokenized_text, names_and_entities = self.parse_sentence(full_text, stemmer=False)

        doc_length = len(tokenized_text)  # after text operations.
        if doc_length == 0:
            return None

        for term in tokenized_text:
            if len(term) < 2:
                continue
            elif term.isdigit() and len(term) > 3:
                continue
            if stemmer:
                term = self.porter_stemmer.stem_term(term)
            if term not in term_dict.keys():
                term_dict[term] = 1
            else:
                term_dict[term] += 1

        for term in names_and_entities.keys():
            if len(term) < 2: continue
            if term in self.stop_words:
                continue
            if term not in term_dict.keys():
                term_dict[term] = 1
            else:
                term_dict[term] += 1

        document = Document(tweet_id, tweet_date, full_text, url, retweet_text, retweet_url, quote_text,
                            quote_url, term_dict, len(self.array_names_and_entities), rt, doc_length)
        return document
Example #6
class Parse:
    __slots__ = [
        'word_dict', 'stemmer', 'stop_words', 'rules', 'spell', 'min_length'
    ]

    def __init__(self, config):
        self.word_dict = {}
        self.stemmer = Stemmer(config.stemming)
        self.stop_words = [
            self.stemmer.stem_term(word) for word in stopwords.words('english')
        ] + ['rt', 't.co', 'https']
        self.rules = config.parser_rules
        self.spell = SpellChecker()
        self.min_length = config.min_length

    # helper function for numberToString --> rounds down, keeping `decimals` digits after the point
    @staticmethod
    def round_down(n, decimals=0):
        multiplier = 10**decimals
        return math.floor(n * multiplier) / multiplier

    @staticmethod
    def isNumber(word):
        return '0' <= word[0] <= '9'

    def numberToString(self, num):
        if num < 1000:
            return str(num)
        elif 1000 <= num < 1000000:
            num = num / 1000
            num = self.round_down(num, 3)
            if num == int(num):
                num = int(num)
            s = str(num)
            return s + 'K'
        elif 1000000 <= num < 1000000000:
            num = num / 1000000
            num = self.round_down(num, 3)
            if num == int(num):
                num = int(num)
            s = str(num)
            return s + 'M'
        else:
            num = num / 1000000000
            num = self.round_down(num, 3)
            if num == int(num):
                num = int(num)
            s = str(num)
            return s + 'B'
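
    # A sketch of numberToString, assuming `p` is an instance of this class:
    #
    #     >>> p.numberToString(3500)
    #     '3.5K'
    #     >>> p.numberToString(123456678)
    #     '123.456M'
    #     >>> p.numberToString(2000000000)
    #     '2B'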

    # "Cleans" the word: strips punctuation such as ,!$&*... from its start/end (leading @/#/" and trailing "/$ are kept)
    @staticmethod
    def strip_punctuations(word):
        if word == '$':
            return word
        start = 0
        end = len(word) - 1
        while start < len(word) and word[start] in (string.punctuation +
                                                    '\n\t'):
            if word[start] == '@' or word[start] == '#' or word[start] == '"':
                break
            start += 1
        while end >= 0 and word[end] in string.punctuation:
            if word[end] == '"' or word[end] == '$':
                break
            end -= 1
        return word[start:end + 1]
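
    # A sketch of strip_punctuations (a @staticmethod, so it can be called on the class):
    #
    #     >>> Parse.strip_punctuations("!!hello,,")
    #     'hello'
    #     >>> Parse.strip_punctuations("#stayhome!")
    #     '#stayhome'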

    # Cleans the text: drops every character that does not exist in the ascii table
    @staticmethod
    def removeEmojis(text):
        return text.encode('ascii', 'ignore').decode('ascii')

    # #stayAtHome ---> ['#stayAtHome', 'stay', 'At', 'Home']
    @staticmethod
    def hashtag(term):
        res = [term]
        start = 1
        for i in range(2, len(term)):
            if term[i].isupper():
                res.append(term[start:i])
                start = i
        res.append(term[start:])
        return res

    @staticmethod
    def URL(text):
        return [v for v in re.split('[://]|[/?]|[/]|[=]', text) if v]
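
    # A sketch of URL splitting (also a @staticmethod):
    #
    #     >>> Parse.URL("https://www.instagram.com/p/CD7?igshid=o9")
    #     ['https', 'www.instagram.com', 'p', 'CD7', 'igshid', 'o9']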

    @staticmethod
    def extendURLs(document):
        url_map = json.loads(document[3])
        url_indices = json.loads(document[4])
        full_text = document[2]
        offset = 0
        for index in url_indices:
            try:
                new_offset = offset + len(url_map[full_text[(index[0] + offset):(index[1] + offset)]]) - index[1] + \
                             index[0]
                full_text = full_text[:(index[0] + offset)] + url_map[
                    full_text[(index[0] + offset):(
                        index[1] + offset)]] + full_text[(index[1] + offset):]
                offset = new_offset
            except:
                pass
        document[2] = full_text

    @staticmethod
    def add_or_inc(d, term):
        if not term:
            return
        elif term not in d:
            d[term] = 0
        d[term] += 1

    def add_to_dict(self, word):
        low_case = word.lower()
        if low_case in self.stop_words:
            return None
        if len(low_case) < self.min_length:
            return None
        if self.rules['capitals']:
            if low_case in self.word_dict.keys():
                if word == low_case:
                    self.word_dict[low_case].text = low_case
            else:
                self.word_dict[low_case] = Term(word)
        else:
            if low_case not in self.word_dict.keys():
                self.word_dict[low_case] = Term(low_case)
        return self.word_dict[low_case]

    def add_entity_to_dict(self, word):
        low_case = word.lower()
        if low_case in self.stop_words:
            return None
        if low_case in self.word_dict.keys():
            self.word_dict[low_case].numOfInterfaces += 1
            if word == low_case:
                self.word_dict[low_case].text = low_case
        else:
            self.word_dict[low_case] = Term(word)
            self.word_dict[low_case].is_entity = True
        return self.word_dict[low_case]

    def Tokenize(self, text):
        output = {}
        if self.rules['spellcheck']:
            word_list = [
                self.spell.correction(word) for word in [
                    self.stemmer.stem_term(self.strip_punctuations(word))
                    for word in text.split()
                ] if word
            ]
        else:
            word_list = [
                word for word in [
                    self.stemmer.stem_term(self.strip_punctuations(word))
                    for word in text.split()
                ] if word
            ]

        size = len(word_list)

        # find all the quoted phrases in this doc
        # re.findall() returns a list of the quotes' contents without the surrounding " "
        if self.rules['quotes']:
            quotes = [
                self.add_to_dict('"{}"'.format(quote))
                for quote in re.findall(r'"(.*?)"', text)
            ]
            for q in quotes:
                self.add_or_inc(output, q)

        # The main loop. A while loop is used (instead of `for i in range(size)`) so that
        # the `i += 1` skips below actually consume the look-ahead tokens.
        i = 0
        while i < size:
            word = word_list[i]

            if self.rules['entity']:
                if (i + 1) < size and 'A' <= word[
                        0] <= 'Z' and 'A' <= word_list[i + 1][0] <= 'Z':
                    j = i + 2
                    entity = word + ' ' + word_list[i + 1]
                    self.add_or_inc(output, self.add_entity_to_dict(entity))
                    while j < size and 'A' <= word_list[j][0] <= 'Z':
                        entity = entity + ' ' + word_list[j]
                        self.add_or_inc(output,
                                        self.add_entity_to_dict(entity))
                        j += 1
            if self.rules['less_more']:
                if (i + 1) < size and word.lower() in ['less', 'more']:
                    new_term = f'{word} {word_list[i + 1]}'
                    if word_list[i + 1].lower() == 'than' and i + 2 < size:
                        new_term += f' {word_list[i + 2]}'
                    self.add_or_inc(output, self.add_to_dict(new_term.lower()))
            if self.isNumber(word):
                if self.rules['number']:
                    try:
                        if i + 1 < size and word_list[i + 1].lower() in [
                                self.stemmer.stem_term('percent'),
                                self.stemmer.stem_term('percentage')
                        ]:
                            i += 1
                            word += '%'

                        elif i + 1 < size and word_list[i + 1].lower() in [
                                self.stemmer.stem_term('dollar'),
                                self.stemmer.stem_term('dollars')
                        ]:
                            i += 1
                            word += '$'

                        # check if the number is actually split into 2 tokens, e.g. "35 3/5"
                        elif i + 1 < size and self.isNumber(
                                word_list[i + 1]) and '/' in word_list[i + 1]:
                            word += ' ' + word_list[i + 1]
                        # cases of Thousand=K, Million=M, Billion=B ---> handled by numberToString
                        elif i + 1 < size and word_list[i + 1].lower(
                        ) == self.stemmer.stem_term('thousand'):
                            i += 1
                            word = self.numberToString(float(word) * 1000)
                        elif i + 1 < size and word_list[i + 1].lower(
                        ) == self.stemmer.stem_term('million'):
                            i += 1
                            word = self.numberToString(float(word) * 1000000)
                        elif i + 1 < size and word_list[i + 1].lower(
                        ) == self.stemmer.stem_term('billion'):
                            i += 1
                            word = self.numberToString(
                                float(word) * 1000000000)
                        else:
                            word = self.numberToString(float(word))
                    except:
                        pass
                    self.add_or_inc(output, self.add_to_dict(word))
            # hashtag
            elif word[0] == '#':
                if self.rules['hashtag']:
                    for word in self.hashtag(word):
                        self.add_or_inc(output, self.add_to_dict(word))
            # URL
            elif word[0:4] == "http":
                if self.rules['url']:
                    for word in self.URL(word):
                        self.add_or_inc(output, self.add_to_dict(word))

            # Tag
            elif word[0] == '@':
                if self.rules['tag']:
                    self.add_or_inc(output, self.add_to_dict(word))
            else:
                self.add_or_inc(output, self.add_to_dict(word))
            i += 1
        return output

    def parse_doc(self, doc_as_list):
        """
        This function takes a tweet document as list and break it into different fields
        :param doc_as_list: list representing the tweet.
        :return: Document object with corresponding fields.
        """
        tweet_id = doc_as_list[0]
        tweet_date = doc_as_list[1]
        full_text = doc_as_list[2]
        url = doc_as_list[3]
        retweet_text = doc_as_list[4]
        retweet_url = doc_as_list[5]
        quote_text = doc_as_list[6]
        quote_url = doc_as_list[7]

        if self.rules['ext_url']:
            self.extendURLs(doc_as_list)
            full_text = doc_as_list[2]

        if self.rules['emoji']:
            full_text = self.removeEmojis(full_text)

        full_text = full_text.replace('\n', ' ')

        term_dict = self.Tokenize(full_text)

        doc_length = sum(term_dict.values())

        max_word = max(term_dict.values())

        document = Document(tweet_id, tweet_date, full_text, url, retweet_text,
                            retweet_url, quote_text, quote_url, term_dict,
                            doc_length, max_word)
        return document
Example #7
class Parse:
    # CONSTANTS
    KBM_SHORTCUTS = {
        "k": None,
        "m": None,
        "b": None,
        "K": None,
        "M": None,
        "B": None
    }
    MONTHS_DICT = {"Jul": ("july", "07"), "Aug": ("august", "08")}
    DAYS_DICT = {
        "Sat": "saturday",
        "Sun": "sunday",
        "Mon": "monday",
        "Tue": "tuesday",
        "Wed": "wednsday",
        "Thu": "thursday",
        "Fri": "friday"
    }
    RIGHT_SLASH_PATTERN = re.compile(r'^-?[0-9]+\\0*[1-9][0-9]*$')
    LEFT_SLASH_PATTERN = re.compile(r'^-?[0-9]+/0*[1-9][0-9]*$')
    NON_LATIN_PATTERN = re.compile(
        pattern=
        r'[^\x00-\x7F\x80-\xFF\u0100-\u017F\u0180-\u024F\u1E00-\u1EFF\u2019]')
    HASHTAG_SPLIT_PATTERN = re.compile(
        r'[a-zA-Z0-9](?:[a-z0-9]+|[A-Z0-9]*(?=[A-Z]|$))')
    COVID_DICT = {
        'covid': None,
        'covid-19': None,
        'coronavirus': None,
        'covid19': None,
        'chinavirus': None,
        '#covid19': None
    }

    def __init__(self, stemming):
        self.stop_words = stopwords.words('english')
        self.stop_words.extend([
            'rt', '“', r'’', r'n\'t', 'n\'t', '\'s', r'\'s', r'\'ve', r'\'m',
            '...', r'\'\'', r'\'d', '&', r'\'ll', r'\'re', r' ', r'', r"",
            r"''", r'""', r'"', r"“", "”", r"’", "‘", r"``", '``', r"'", r"`",
            r'!', r'?', r',', r':', r';', r'(', r')', r'...', r'[', ']', r'{',
            '}',
            "'&'", '.', r'\'d', '-', '--', 'mask', 'pandemic', 'people',
            'wear', 'trump', 'masks', 'new', 'virus', 'wearing', 'cases',
            'amp', 'us', 'like'
        ])
        # , 'covid', '19', 'covid-19', 'mask', 'coronavirus', 'pandemic', 'people', 'wear', 'trump', 'covid19', 'masks', 'new', 'virus', 'wearing', 'cases', 'amp', '#covid19', 'us', 'like'
        self.stop_words_dict = dict.fromkeys(self.stop_words)

        self.text_tokens = None

        self.stemmer = None
        if stemming:
            self.stemmer = Stemmer()

    def parse_sentence(self, text):
        """
        This function tokenize, remove stop words and apply lower case for every word within the text
        :param text:
        :param capital_letter_indexer: dictionary for words with capital letters
        :param named_entities: dictionary for named entities in doc
        :return:
        """
        self.text_tokens = word_tokenize(text)
        tokenized_list = []
        entity_chunk = ''
        empty_chunk = 0
        capital_letter_indexer = {}
        named_entities = set()

        for idx, token in enumerate(self.text_tokens):

            if token.lower() in self.stop_words_dict or (len(token) == 1
                                                         and ord(token) > 126):
                continue

            if token == '@' and len(self.text_tokens) > idx + 1:
                self.text_tokens[idx + 1] = ''
                continue
            c1 = token[0]
            if (ord(c1) < 48 or 57 < ord(c1) < 65 or 90 < ord(c1) < 97
                    or 122 < ord(c1)) and c1 != '#':
                continue
            if token in self.COVID_DICT:
                tokenized_list.append('covid')
                continue

            if len(token) > 0 and token[0].isupper():
                # chunks entities together.
                entity_chunk += token + " "
                empty_chunk += 1
            else:
                # add entity to the global counter and to the current words set
                if entity_chunk != '':
                    named_entities.add(entity_chunk[:-1])
                    if empty_chunk > 1:
                        tokenized_list.append(entity_chunk[:-1].lower())
                    entity_chunk = ''
                    empty_chunk = 0

            if token == '#':
                self.handle_hashtags(tokenized_list, idx)
            elif self.is_fraction(token):
                self.handle_fraction(tokenized_list, token, idx)
            elif token in ["%", "percent", "percentage"]:
                self.handle_percent(tokenized_list, idx)
            elif token.isnumeric() or "," in token:
                self.handle_number(tokenized_list, idx, token)
            elif '-' in token and len(token) > 1:
                self.handle_dashes(tokenized_list, token)
            elif token == 'https' and idx + 2 < len(self.text_tokens):
                # Will enter only if there are no urls in the dictionaries.
                splitted_trl = self.split_url(self.text_tokens[idx + 2])
                tokenized_list.extend([x.lower() for x in splitted_trl])
                self.text_tokens[idx + 2] = ''
            elif token[
                    -1] in self.KBM_SHORTCUTS and self.convert_string_to_float(
                        token[:-1]):
                tokenized_list.append(token.upper())
            else:
                if self.stemmer is not None:
                    token = self.stemmer.stem_term(token)
                self.append_to_tokenized(tokenized_list,
                                         capital_letter_indexer, token)

        return tokenized_list, capital_letter_indexer, named_entities

    def parse_doc(self, doc_as_list):
        """
        This function takes a tweet document as list and break it into different fields
        :param doc_as_list: list representing the tweet.
        :return: Document object with corresponding fields.
        """
        if len(doc_as_list) > 0:
            tweet_id = int(doc_as_list[0])
        else:
            tweet_id = None
        if len(doc_as_list) > 1:
            tweet_date = doc_as_list[1]
        else:
            tweet_date = None
        if len(doc_as_list) > 2:
            full_text = doc_as_list[2]
        else:
            full_text = None
        if len(doc_as_list) > 3:
            url = self.json_convert_string_to_object(doc_as_list[3])
        else:
            url = None
        if len(doc_as_list) > 6:
            retweet_url = self.json_convert_string_to_object(doc_as_list[6])
        else:
            retweet_url = None
        if len(doc_as_list) > 8:
            quote_text = doc_as_list[8]
        else:
            quote_text = None
        if len(doc_as_list) > 9:
            quote_url = self.json_convert_string_to_object(doc_as_list[9])
        else:
            quote_url = None
        if len(doc_as_list) > 12:
            retweet_quoted_url = self.json_convert_string_to_object(
                doc_as_list[12])
        else:
            retweet_quoted_url = None
        if full_text is None or tweet_id is None or tweet_date is None:
            return None
        dict_list = [url, retweet_url, quote_url, retweet_quoted_url]
        max_tf = 0

        # if tweet_id in [1291243586835472384, 1291188776493080576, 1291180630315868162, 1291329776444112902, 1291356400829038592]:
        #     print()

        urls_set = set()
        try:
            # holds all URLs in one place
            for d in dict_list:
                if d is not None:
                    for key in d.keys():
                        if key is not None and d[key] is not None:
                            urls_set.add(d[key])
        except:
            urls_set = set()
        if quote_text is not None:
            full_text = full_text + " " + quote_text
        # removes redundant short URLs from full_text
        if len(urls_set) > 0:
            full_text = self.clean_text_from_urls(full_text)
        # takes off non-latin words.
        full_text = re.sub(self.NON_LATIN_PATTERN, u'', full_text)
        if len(full_text) == 0:
            return None

        tokenized_text, capital_letter_indexer, named_entities = self.parse_sentence(
            full_text)

        if len(tokenized_text) == 0:
            return None
        # tokenized_text.extend([x.lower() for x in self.handle_dates(tweet_date)])
        # expends the full text with tokenized urls
        self.expand_tokenized_with_url_set(tokenized_text, urls_set)
        term_dict = {}
        doc_length = len(tokenized_text)  # after text operations.
        for idx, term in enumerate(tokenized_text):
            if term not in term_dict.keys():
                # holding term's locations at current tweet
                term_dict[term] = 1
            else:
                term_dict[term] += 1
            if term_dict[term] > max_tf:
                max_tf = term_dict[term]

        tweet_date = datetime.strptime(tweet_date, '%a %b %d %X %z %Y')

        document = Document(tweet_id, tweet_date, term_dict, doc_length,
                            max_tf, len(term_dict), capital_letter_indexer,
                            named_entities)
        return document

    def handle_hashtags(self, tokenized_list, idx):
        """
        merges text_tokens[idx] with text_tokens[idx+1] such that '#','exampleText' becomes '#exampleText'
        and inserts 'example' and 'Text' to text_tokens
        :param tokenized_list: list that the terms will be appended to
        :param idx: index of # in text_tokens
        :return:
        """
        if len(self.text_tokens) > idx + 1:
            splitted_hashtags = self.hashtag_split(self.text_tokens[idx + 1])
            # tokenized_list.append((self.text_tokens[idx] + self.text_tokens[idx + 1]).lower())
            tokenized_list.extend([
                x.lower() for x in splitted_hashtags
                if x.lower() not in self.stop_words_dict
            ])
            self.text_tokens[idx + 1] = ''

    def handle_tags(self, tokenized_list, idx):
        """
        merges text_tokens[idx] with text_tokens[idx+1] such that '@','example' becomes '@example'
        :param tokenized_list: list of tokenized words
        :param idx: index of @ in text_tokens
        """

        if len(self.text_tokens) > idx + 1:
            # tokenized_list.append((self.text_tokens[idx] + self.text_tokens[idx + 1]).lower())
            # self.text_tokens[idx] = ''
            self.text_tokens[idx + 1] = ''

    def hashtag_split(self, tag):
        """
        splits a multi-word hash-tag to a list of its words
        :param tag: single hash-tag string
        :return: list of words in tag
        """
        return re.findall(self.HASHTAG_SPLIT_PATTERN, tag)
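
    # A sketch of hashtag_split, assuming `parser` is an instance of this class:
    #
    #     >>> parser.hashtag_split("stayAtHome")
    #     ['stay', 'At', 'Home']
    #     >>> parser.hashtag_split("COVID19")
    #     ['COVID19']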

    def handle_percent(self, tokenized_list, idx):
        """
        merges text_tokens[idx] with text_tokens[idx-1] such that "%"/"percent"/"percentage",'50' becomes '50%'
        :param tokenized_list: list of tokenized words
        :param idx: index of % in text_tokens
        :return:
        """
        if idx != 0:
            dash_idx = self.text_tokens[idx - 1].find('-')
            if self.is_fraction(self.text_tokens[idx - 1]):
                number = self.text_tokens[idx - 1]
            else:
                number = self.convert_string_to_float(self.text_tokens[idx -
                                                                       1])
            if number is not None:
                if (self.text_tokens[idx - 1].lower() +
                        "%").lower() not in self.stop_words_dict:
                    tokenized_list.append(self.text_tokens[idx - 1].lower() +
                                          "%")
            elif dash_idx != -1:
                left = self.text_tokens[idx - 1][:dash_idx]
                right = self.text_tokens[idx - 1][dash_idx + 1:]
                if left.isnumeric() and right.isnumeric() and (
                        self.text_tokens[idx - 1].lower() +
                        "%") not in self.stop_words_dict:
                    tokenized_list.append(self.text_tokens[idx - 1].lower() +
                                          "%")

    def handle_number(self, tokenized_list, idx, token):
        """
        converts all numbers to single format:
        2 -> 2
        68,800 -> 68.8K
        123,456,678 -> 123.456M
        3.5 Billion -> 3.5B
        :param tokenized_list: list of tokenized words
        :param idx: index of the number in text_tokens
        :param token: text_tokens[idx]
        :return:
        """
        number = self.convert_string_to_float(token)
        if number is None:
            tokenized_list.append(token.lower())
            return

        multiplier = 1

        if len(self.text_tokens) > idx + 1:
            if self.text_tokens[idx + 1] in ["%", "percent", "percentage"]:
                return

            if self.text_tokens[idx + 1].lower() in [
                    "thousand", "million", "billion"
            ]:
                if self.text_tokens[idx + 1].lower() == "thousand":
                    multiplier = 1000
                elif self.text_tokens[idx + 1].lower() == "million":
                    multiplier = 1000000
                elif self.text_tokens[idx + 1].lower() == "billion":
                    multiplier = 1000000000
                self.text_tokens[idx + 1] = ''

        number = number * multiplier
        kmb = ""

        if number >= 1000000000:
            number /= 1000000000
            kmb = 'B'

        elif number >= 1000000:
            number /= 1000000
            kmb = 'M'

        elif number >= 1000:
            number /= 1000
            kmb = 'K'

        # if number is not an integer, separates it to integer and fraction
        # and keeps at most the first three digits in the fraction
        if "." in str(number):
            dot_index = str(number).index(".")
            integer = str(number)[:dot_index]
            fraction = str(number)[dot_index:dot_index + 4]

            if fraction == ".0":
                number = integer
            else:
                number = integer + fraction
        else:
            number = str(number)

        tokenized_list.append(number + kmb)

    def convert_string_to_float(self, s):
        """
        tries to convert a string to a float
        if succeeds, returns float
        if fails, returns None
        :param s: string to convert
        :return: float / None
        """
        if "," in s:
            s = s.replace(",", "")
        try:
            number = float(s)
            return number
        except:
            return None

    def split_url(self, url):
        """
        separates a URL string to its components
        ex:
            url = https://www.instagram.com/p/CD7fAPWs3WM/?igshid=o9kf0ugp1l8x
            output = [instagram.com, p, cd7fapws3wm, igshid, o9kf0ugp1l8x]
        :param url: url as string
        :return: list of sub strings
        """
        if url is not None:
            r = re.split('[/://?=]', url)
            if 'twitter.com' in r or 't.co' in r:
                return []
            if len(r) > 3 and 'www.' in r[3]:
                r[3] = r[3][4:]
            return [
                x.lower() for x in r
                if (x != '' and x != 'https' and not x.startswith('#'))
            ]

    def expand_tokenized_with_url_set(self, to_extend, urls_set):
        """
        extends the to_extend list with the parsed values in url_set
        :param to_extend: list of strings to extend
        :param urls_set: a Set containing URL strings
        :return:
        """
        for url in urls_set:
            to_extend.extend(self.split_url(url))

    def take_emoji_off(self, token):
        return self.emoji_pattern.sub(r'', token)

    def json_convert_string_to_object(self, s):
        """
        converts a given string to its corresponding object according to json
        used specifically to dictionaries
        :param s: string to convert
        :return: Object / None
        """
        if s is None or s == '{}':
            return None
        else:
            try:
                return json.loads(s)
            except:
                return None

    def clean_text_from_urls(self, text):
        """
        removes all URLs from text
        :param text: string
        :return: string without urls
        """
        text = re.sub(r'http\S+|www.\S+', '', text)
        return text

    def handle_dashes(self, tokenized_list, token):
        """
        Adds token's words separated to the tokenized list.
        e.g. "Word-word" is handled as ["word-word", "word", "word"]
        :param tokenized_list: list of tokenized words
        :param token: String to separate
        :return: None
        """
        dash_idx = token.find('-')
        after_dash = token[dash_idx + 1:].lower()
        if dash_idx > 0:
            tokenized_list.append(token.lower())
            before_dash = token[:dash_idx].lower()
            if before_dash not in self.stop_words_dict:
                tokenized_list.append(before_dash)
            if after_dash not in self.stop_words_dict:
                tokenized_list.append(after_dash)
        else:
            if after_dash not in self.stop_words_dict:
                tokenized_list.append(after_dash)

    def handle_fraction(self, tokenized_list, token, idx):
        """
        takes care of strings representing fractions
        if there is a number before the fraction, it concats both tokens into one.
        :param tokenized_list: list of tokenized words
        :param token: single word that would be handled
        :param idx: the index of the word in text_tokens
        :return:
        """
        slash_idx = token.find('\\')
        if slash_idx != -1:
            token = token[:slash_idx] + '/' + token[slash_idx + 1:]
        frac = str(Fraction(token))
        if idx == 0 and frac != token and frac.lower(
        ) not in self.stop_words_dict:
            tokenized_list.append(frac.lower())
        else:
            number = self.convert_string_to_float(self.text_tokens[idx - 1])
            if number is not None:
                if (self.text_tokens[idx - 1] + " " +
                        token).lower() not in self.stop_words_dict:
                    tokenized_list.append(
                        (self.text_tokens[idx - 1] + " " + token).lower())
                self.text_tokens[idx] = ''
            elif token != frac:
                if frac.lower() not in self.stop_words_dict:
                    tokenized_list.append(frac.lower())
                if token.lower() not in self.stop_words_dict:
                    tokenized_list.append(token.lower())
            else:
                if token.lower() not in self.stop_words_dict:
                    tokenized_list.append(token.lower())

    def is_fraction(self, token):
        """
        checks whether given token is a fraction.
        :param token: string
        :return: boolean
        """
        return re.match(self.RIGHT_SLASH_PATTERN, token) is not None or \
               re.match(self.LEFT_SLASH_PATTERN, token) is not None
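
    # A sketch of is_fraction (both the "3/5" and "3\5" spellings are accepted),
    # assuming `parser` is an instance of this class:
    #
    #     >>> parser.is_fraction("3/5")
    #     True
    #     >>> parser.is_fraction("3.5")
    #     False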

    def handle_dates(self, tweet_date):
        """
        takes the tweet's date and parses its information
        :param tweet_date: date in string
        :return: list of parsed information
        """
        splitted_date = tweet_date.split()
        day_num = splitted_date[2]
        month_txt, month_num = self.MONTHS_DICT[splitted_date[1]]
        date_num = day_num + "/" + month_num + "/" + splitted_date[5]
        return [month_txt, date_num, splitted_date[3]]

    def append_to_tokenized(self, tokenized_list, capital_letters, token):
        """
        appends given token to tokenized list and to capital_letters dictionary according to its first letter.
        :param tokenized_list: list of tokenized words
        :param capital_letters: dictionary containing words and boolean value.
        :param token: given word.
        :return:
        """
        if len(token) > 0 and token[0].isupper():
            if token not in capital_letters:
                capital_letters[token.lower()] = True
        else:
            capital_letters[token.lower()] = False
        if token.lower() not in self.stop_words_dict:
            c1 = token[0]
            if (ord(c1) < 48 or 57 < ord(c1) < 65 or 90 < ord(c1) < 97
                    or 122 < ord(c1)) and c1 != '#':
                return
            elif len(token) == 1 and 48 <= ord(c1) <= 57:
                return
            tokenized_list.append(token.lower())
Example #8
class Parse:

    def __init__(self, with_stemmer=False, include_urls=False, include_quote=False, debug=False, timer=False):
        self.stemmer = Stemmer()
        self.with_stemmer = with_stemmer
        self.include_urls = include_urls
        self.include_quote = include_quote
        self.stop_words = stopwords.words('english')
        self.stop_words += ["i'm", "it's", 'they', "i've", 'you', 'u', 'we', 'rt', 'im', 'use', 'sure', ]
        self.debug = debug
        self.timer = timer
        self.times = []

    def _is_number(self, number):
        return number.replace(',', '').replace('.', '', 1).replace('%', '', 1).replace('$', '', 1).replace('K', '', 1) \
            .replace('M', '', 1).replace('B', '', 1).isdigit()

    def _pre_parse(self, text):
        text = ' '.join([w for w in text.split(' ') if '…' not in w])
        whitespace = ' \t\n\r\v\f'
        ascii_lowercase = 'abcdefghijklmnopqrstuvwxyz'
        ascii_uppercase = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
        digits = '0123456789'
        # punctuation = r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""
        punctuation = r"""!#$%&'*’+,-./<=>?@[\]^_{|}~"""
        printable = digits + ascii_lowercase + ascii_uppercase + punctuation + whitespace
        text = ''.join([x for x in text if x in printable])

        text = text.replace('\n', ' ')  # remove new lines
        text = re.sub(' +', ' ', text)  # Remove double spaces
        return text

    def _extract_entities(self, text):
        terms = []
        entities_terms = []
        subterm = ''

        for subtext in text.split(','):
            sub_terms = subtext.split(' ')
            for term in sub_terms:
                if not term.replace("'", '').replace('-', '').isalnum(): #Not a word
                    if len(subterm.split(' ')) >= 2:
                        entities_terms.append(subterm)
                    subterm = ''
                elif term[0].upper() == term[0]:
                    if subterm == '':
                        subterm = term.replace('-', ' ')
                    else:
                        subterm += ' ' + term.replace('-', ' ')
                else:
                    if len(subterm.split(' ')) >= 2:
                        entities_terms.append(subterm)
                    subterm = ''
                terms.append(term)

        entities_terms = [term for term in entities_terms if term != '']
        return entities_terms, terms

    def _number_transform(self, term):
        opt_term = term.replace('%', '', 1).replace('$', '', 1).replace('K', '', 1) \
            .replace('M', '', 1).replace('B', '', 1)
        replaced_term_optional = opt_term.replace(',', '')
        if not self._is_number(term.replace(',', '')):
            return term

        if float(replaced_term_optional) < 1000:
            number = round(float(replaced_term_optional), 3)
            if number == float(int(float(replaced_term_optional))):
                number = int(number)
            return term.replace(replaced_term_optional, str(number))

        elif float(replaced_term_optional) < 1000000:
            if term.isdigit() and len(term) == 4 and int(term) > 1500 and int(term) < 2100:  # Maybe a year
                return term
            else:
                number = round(float(replaced_term_optional) / 1000, 3)
                if number == float(float(replaced_term_optional) // 1000):
                    number = int(number)
                return term.replace(opt_term, str(number) + 'K')
        elif float(replaced_term_optional) < 1000 * 1000 * 1000:
            number = round(float(replaced_term_optional) / 1000000, 3)
            if number == float(float(replaced_term_optional) // 1000000):
                number = int(number)
            return term.replace(opt_term, str(number) + 'M')
        elif float(replaced_term_optional) < 1000 * 1000 * 1000 * 1000:
            number = round(float(replaced_term_optional) / 1000000000, 3)
            if number == float(float(replaced_term_optional) // 1000000000):
                number = int(number)
            return term.replace(opt_term, str(number) + 'B')
        else:
            return term
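
    # A sketch of _number_transform, assuming `p` is an instance of this class
    # (the '3.5B' case relies on the corrected billions branch above):
    #
    #     >>> p._number_transform("68,800")
    #     '68.8K'
    #     >>> p._number_transform("2020")
    #     '2020'
    #     >>> p._number_transform("3500000000")
    #     '3.5B'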

    def _url_transform(self, url):
        parts = []

        url_parts = url.split('/')
        parts.append(url_parts[0][:-1])

        addr = url_parts[2]
        addr_parts = addr.split('.')
        addr_parts = [addr_parts[0]] + ['.'.join(addr_parts[1:])] if addr_parts[0] == 'www' else ['.'.join(addr_parts)]

        parts = parts + addr_parts + url_parts[3:-1]

        info = url_parts[-1].split('?')

        if len(info) == 1:
            parts = parts + info
        elif len(info) == 3:
            assert 1 == 0
        else:
            parts.append(info[0])

            props = info[1].split('&')
            for prop in props:
                parts = parts + prop.split('=')

        parts = [p for p in parts if p != '']
        return parts
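
    # A sketch of _url_transform, assuming `p` is an instance of this class:
    #
    #     >>> p._url_transform("https://www.instagram.com/p/CD7?igshid=o9")
    #     ['https', 'www', 'instagram.com', 'p', 'CD7', 'igshid', 'o9']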

    def remove_comma(self, w):
        w = re.sub('[,]*$', '', w)
        w = re.sub('[.]*$', '', w)
        w = re.sub('^[,]*', '', w)
        w = re.sub('^[.]*', '', w)
        w = re.sub('[:]*$', '', w)
        w = re.sub('[-]+', ' ', w)
        w = re.sub('[’]+', "'", w)
        w = re.sub('[?]*$', '', w)
        w = re.sub('[!]*$', '', w)
        return w

    def _splitHashtags(self, term_):
        for i in range(len(term_) - 1)[::-1]:
            if term_[i].isupper() and term_[i + 1].islower():
                term_ = term_[:i] + ' ' + term_[i:]
            if term_[i].isupper() and term_[i - 1].islower():
                term_ = term_[:i] + ' ' + term_[i:]
        return term_.split()
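
    # A sketch of _splitHashtags, assuming `p` is an instance of this class:
    #
    #     >>> p._splitHashtags("stayAtHome")
    #     ['stay', 'At', 'Home']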

    def _hashtags_tag_parse(self, tokens):
        result_tokens = []
        rest_tokens = []
        for w in tokens:
            if w[0] == '#':
                for subw in w[1:].split('_'):
                    splited_hashtag = self._splitHashtags(subw)
                    result_tokens += [sub_hashtag.lower() for sub_hashtag in splited_hashtag]
                result_tokens.append(w.replace('_', '').lower())
            elif w[0] == '@':
                result_tokens.append(w)
            else:
                rest_tokens.append(w)
        return result_tokens, rest_tokens

    def _special_parse(self, tokens):
        parse_number_comma_tokens = []
        for w in tokens:
            n_new_text_tokens = len(parse_number_comma_tokens) - 1
            if (w.lower() == 'percent' or w.lower() == 'percentage') and len(parse_number_comma_tokens) != 0 and \
                    self._is_number(parse_number_comma_tokens[n_new_text_tokens]):
                parse_number_comma_tokens[n_new_text_tokens] = parse_number_comma_tokens[n_new_text_tokens] + '%'
            elif (w.lower() == 'dollar' or w.lower() == 'dollars') and len(parse_number_comma_tokens) != 0 and \
                    self._is_number(parse_number_comma_tokens[n_new_text_tokens]):
                parse_number_comma_tokens[n_new_text_tokens] = parse_number_comma_tokens[n_new_text_tokens] + '$'
            elif w.lower() == 'thousand' and len(parse_number_comma_tokens) != 0 and \
                    self._is_number(parse_number_comma_tokens[n_new_text_tokens]):
                parse_number_comma_tokens[n_new_text_tokens] = parse_number_comma_tokens[n_new_text_tokens] + 'K'
            elif (w.lower() == 'million' or w.lower() == 'mill') and len(parse_number_comma_tokens) != 0 and \
                    self._is_number(parse_number_comma_tokens[n_new_text_tokens]):
                parse_number_comma_tokens[n_new_text_tokens] = parse_number_comma_tokens[n_new_text_tokens] + 'M'
            elif w.lower() == 'billion' and len(parse_number_comma_tokens) != 0 and \
                    self._is_number(parse_number_comma_tokens[n_new_text_tokens]):
                parse_number_comma_tokens[n_new_text_tokens] = parse_number_comma_tokens[n_new_text_tokens] + 'B'
            elif len(w.split('/')) == 2 and w.split('/')[0].isdigit() and len(parse_number_comma_tokens) != 0 and \
                    w.split('/')[1].isdigit() and self._is_number(parse_number_comma_tokens[n_new_text_tokens]):
                parse_number_comma_tokens[n_new_text_tokens] = parse_number_comma_tokens[n_new_text_tokens] + ' ' + w
            else:
                parse_number_comma_tokens.append(w)
        return parse_number_comma_tokens

    def _remove_slashes(self, tokens):
        result_tokens = []
        for token in tokens:
            if len(token.split('/')) == 1:
                result_tokens.append(token)
                continue
            splited = token.split('/')
            if len(splited) == 2 and splited[0].isdigit() and splited[1].isdigit():
                result_tokens.append(token)
            else:
                result_tokens += splited
        return result_tokens

    def _apply(self, func, input):
        end_time, start_time = 0, 0
        if self.timer:
            start_time = time.perf_counter()
            result = func(input)
            end_time = time.perf_counter()
        else:
            result = func(input)

        if self.debug:
            print(result)

        self.times.append(end_time - start_time)
        return result

    def parse_sentence(self, text):
        """
        This function tokenizes, removes stop words and applies lower case for every word within the text
        :param text:
        :return:
        """
        self.timer = True
        self.times = []

        if self.debug:
            print('Text:', text)

        text = self._apply(self._pre_parse, text)
        entities, temp_text_tokens = self._apply(self._extract_entities, text)

        removed_urls_tokens = [w for w in temp_text_tokens if not w.startswith('https')]

        text_tokens = self._apply(self._remove_slashes, removed_urls_tokens)

        remove_comma_terms = [self.remove_comma(term) for term in text_tokens if self.remove_comma(term) != '']
        entities_terms = [self.remove_comma(term) for term in entities if self.remove_comma(term) != '']
        fix_numbers_terms = [self._number_transform(w) for w in remove_comma_terms]

        parse_number_comma_tokens = self._apply(self._special_parse, fix_numbers_terms)

        parse_number_comma_tokens = [w for w in parse_number_comma_tokens if w.lower() not in self.stop_words]

        tokens_parsed, rest_tokens = self._apply(self._hashtags_tag_parse, parse_number_comma_tokens)

        capital_tokens = [token.upper() for token in rest_tokens if token.lower() != token]
        rest_tokens = [token for token in rest_tokens if token.lower() == token]

        if self.with_stemmer:
            rest_tokens = [self.stemmer.stem_term(token) for token in rest_tokens]
        total_tokens = rest_tokens + entities_terms + tokens_parsed + capital_tokens

        if self.debug:
            print('Total tokens:', total_tokens)
        return total_tokens

    def _parse_urls(self, urls):
        urls = urls.replace('null', 'None')
        urls_tokens = [self._url_transform(w) for w in eval(urls).values() if
                       w != '' and w is not None and 'twitter.com' not in w]
        urls_tokens = [item for sublist in urls_tokens for item in sublist]
        return urls_tokens

    def parse_doc(self, doc_as_list):
        """
        This function takes a tweet document as list and break it into different fields
        :param doc_as_list: list representing the tweet.
        :return: Document object with corresponding fields.
        """

        tweet_id = doc_as_list[0]
        tweet_date = doc_as_list[1]
        full_text = doc_as_list[2]
        url = doc_as_list[3]
        quote_text = doc_as_list[8]
        quote_url = doc_as_list[9]
        term_dict = {}

        #print(full_text)

        try:
            tokenized_text = self.parse_sentence(full_text)
        except:
            print(full_text)
            tokenized_text = []
        # print(tokenized_text)
        # print('---------------------------------------------------------')

        if self.include_urls:
            tokenized_text += self._parse_urls(url)

        if self.include_quote and quote_text is not None:
            tokenized_text += self.parse_sentence(quote_text)

        if self.include_quote and self.include_urls and quote_url is not None:
            tokenized_text += self._parse_urls(quote_url)

        doc_length = len(tokenized_text)  # after text operations.

        for term in tokenized_text:
            if term not in term_dict.keys():
                term_dict[term] = 1
            else:
                term_dict[term] += 1

        document = Document(tweet_id, tweet_date, full_text, url, retweet_text=None, retweet_url=None,
                            quote_text=quote_text, quote_url=quote_url, term_doc_dictionary=term_dict,
                            doc_length=doc_length)
        return document
Example #9
class Parse:
    num_of_docs = 0
    total_doc_length = 0
    retweet_dict = {}

    def __init__(self, config=None, advanced=False):
        # stopwords_to_add = ['rt']
        self.english_word = words.words()
        self.stop_words = stopwords.words('english')
        puncs_to_add = ['...', '', '\'', '“', '”', '’', '…']
        self.punctuators = [punc for punc in string.punctuation] + puncs_to_add
        self.tt = TweetTokenizer()
        self.stemmer = Stemmer()
        self.need_stemming = config.toStem if isinstance(
            config, ConfigClass) else False
        self.caps_dict = {}
        self.rules_dict = {}
        self.advanced = advanced

    def parse_sentence(self, text, urls={}):
        """
        This function tokenize, remove stop words and apply lower case for every word within the text
        :param urls:
        :param text:
        :return:
        """

        text_tokens = self.tt.tokenize(text)
        text_tokens_after_rules = []

        # regEx patterns
        url_pattern = re.compile(
            r'^http[s]?://(?:[a-z]|[0-9]|[$-_@.&amp;+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+'
        )
        hashtag_pattern = re.compile(r'(?:\#+[\w_]+[\w\'_\-]*[\w_]+)')
        mention_pattern = re.compile(r'(?:@[\w_]+)')
        numbers_pattern = re.compile(r'(?:(?:\d+,?)+(?:\.?\d+)?)')
        fractions_pattern = re.compile(r'(-?\d+)/(-?\d+)')
        emails_pattern = re.compile(r'\w\S*@.*\w')
        english_word_pattern = re.compile(r"[A-Za-z'-]+")

        for i, token in enumerate(text_tokens):
            if token.lower() in self.stop_words + self.punctuators:
                continue

            if self.advanced:
                if "-" in token:  # split hyphen
                    text_tokens_after_rules += token.replace("-", " ").split()

                if "/" in token:  # split hyphen
                    text_tokens_after_rules += token.replace("/", " ").split()

                if token.encode(
                        'ascii',
                        'ignore').decode('ascii') == '':  # remove emoji
                    continue

                if emails_pattern.match(token):  # remove emails
                    continue

            maybe_ent = ''
            if token[0].isupper():
                maybe_ent += token
                text_tokens.remove(token)
                if len(text_tokens) > i:
                    token = text_tokens[i]
                    while token[0].isupper():
                        maybe_ent += ' ' + token
                        text_tokens.remove(token)
                        if len(text_tokens) > i:
                            token = text_tokens[i]
                        else:
                            break
                if maybe_ent[0].isupper():
                    self.caps_dict[maybe_ent.lower()] = False
                    self.check_capital(maybe_ent)
                    if len(maybe_ent.split()) == 1:
                        text_tokens_after_rules += [maybe_ent.lower()]
                    else:
                        text_tokens_after_rules += [maybe_ent.lower()] + [
                            tok.lower() for tok in maybe_ent.split()
                        ]

            if token.lower() in self.stop_words + self.punctuators:
                continue

            if hashtag_pattern.match(token):
                text_tokens_after_rules += self.stemming_rule(
                    self.hashtag_rule(token[1:]))

            elif url_pattern.match(token):  # url: expand to terms via the urls dict if available
                if token in urls:
                    url = urls[token]
                    if url is not None:
                        text_tokens_after_rules += self.URL_rule(url)
                continue

            elif mention_pattern.match(token):
                text_tokens_after_rules += self.stemming_rule([token])

            elif numbers_pattern.match(token):
                if numbers_pattern.match(token).span() == (0, len(token)):
                    if i + 1 < len(text_tokens):
                        if text_tokens[i + 1].lower() in [
                                'percent', 'percentage', '%'
                        ]:
                            per = text_tokens[i + 1]
                            text_tokens_after_rules += [
                                self.numbers_rule(token)[0] + '%'
                            ]
                            text_tokens.remove(per)

                        elif text_tokens[i + 1] in ['$', '¢', '£', '€']:
                            sign = text_tokens[i + 1]
                            text_tokens_after_rules += [
                                sign + self.numbers_rule(token)[0]
                            ]
                            text_tokens.remove(sign)

                        elif text_tokens[i +
                                         1].upper() in ['M', 'KM', 'CM', 'MM']:
                            sign = text_tokens[i + 1]
                            text_tokens_after_rules += [
                                self.numbers_rule(token)[0] + sign.upper()
                            ]
                            text_tokens.remove(sign)

                        elif token.replace('.', '').replace(',', '').isdigit():
                            zeros_dict = {
                                'thousand': '0' * 3,
                                'million': '0' * 6,
                                'billion': '0' * 9
                            }
                            multiplier = text_tokens[i + 1]
                            if multiplier.lower() in zeros_dict.keys():
                                text_tokens_after_rules += self.numbers_rule(
                                    token + zeros_dict[multiplier.lower()])
                                text_tokens.remove(multiplier)

                            elif fractions_pattern.match(text_tokens[i + 1]):
                                frac = text_tokens[i + 1]
                                text_tokens_after_rules += [
                                    self.numbers_rule(token)[0] + f' {frac}'
                                ]
                                text_tokens.remove(frac)

                            else:
                                text_tokens_after_rules += self.numbers_rule(
                                    token)
                        elif token[-1].upper() in ['K', 'M', 'B']:
                            zeros_dict = {
                                'K': '0' * 3,
                                'M': '0' * 6,
                                'B': '0' * 9
                            }
                            multiplier = token[-1]
                            text_tokens_after_rules += self.numbers_rule(
                                token[:-1] + zeros_dict[multiplier.upper()])
                        elif token[-2:].upper() in ['BN']:
                            zeros_dict = {'BN': '0' * 9}
                            multiplier = token[-2:]
                            text_tokens_after_rules += self.numbers_rule(
                                token[:-2] + zeros_dict[multiplier.upper()])
                    else:
                        text_tokens_after_rules += self.numbers_rule(token)
                else:
                    text_tokens_after_rules += self.stemming_rule([token])

            else:
                text_tokens_after_rules += self.stemming_rule([token])

        text_tokens_after_rules = [
            w for w in text_tokens_after_rules if w not in self.stop_words
        ]

        return text_tokens_after_rules

    def hashtag_rule(self, text):
        if '_' in text:
            return text.lower().split('_') + [
                '#' + text.lower().replace('_', '')
            ]

        else:
            splitted = re.sub('([A-Z][a-z]+)', r' \1',
                              re.sub('([A-Z]+)', r' \1', text)).split()
            return [s.lower() for s in splitted] + ['#' + text.lower()]
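
    # Illustrative sketch (added): for a hashtag body this rule returns the
    # split words plus the full lowercased hashtag, e.g. (assuming a parser
    # instance p):
    #   p.hashtag_rule('StayAtHome')   -> ['stay', 'at', 'home', '#stayathome']
    #   p.hashtag_rule('stay_at_home') -> ['stay', 'at', 'home', '#stayathome']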

    def URL_rule(self, text):
        splitted = re.split("[, \-!?:=\n/…]+", text)
        splitted[1:1] = splitted[1].split('.', maxsplit=1)
        splitted.remove(splitted[3])
        url_stopwords = self.stop_words + self.punctuators + [
            'http', 'www', 'https', 'com', 'co', 'twitter', 'status', 'web'
        ]
        without_stopwords = [s for s in splitted[:3] if s not in url_stopwords]
        return without_stopwords

    def numbers_rule(self, text):
        number_str = text.split()[0].replace(',', '')
        if '.' in number_str:
            number = float(number_str)
        else:
            number = int(number_str)
        if number < 10**3:
            return ["{:.3f}".format(number).strip('0').strip('.')]
        elif 10**3 <= number < 10**6:
            return [
                "{:.3f}".format(number / 10**3).strip('0').strip('.') + 'K'
            ]
        elif 10**6 <= number < 10**9:
            return [
                "{:.3f}".format(number / 10**6).strip('0').strip('.') + 'M'
            ]
        else:
            return [
                "{:.3f}".format(number / 10**9).strip('0').strip('.') + 'B'
            ]
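
    # Illustrative sketch (added): numbers_rule compacts numbers into K/M/B
    # notation, e.g. (assuming a parser instance p):
    #   p.numbers_rule('250')       -> ['250']
    #   p.numbers_rule('3500')      -> ['3.5K']
    #   p.numbers_rule('2,000,000') -> ['2M']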

    def stemming_rule(self, tokens):
        if self.need_stemming:
            after_tokens = []
            for token in tokens:
                after_tokens.append(self.stemmer.stem_term(token))
            return after_tokens
        else:
            return tokens

    def parse_doc(self, doc_as_list):
        """
        This function takes a tweet document as a list and breaks it into different fields
        :param doc_as_list: list representing the tweet.
        :return: Document object with corresponding fields.
        """

        self.num_of_docs += 1

        tweet_id = doc_as_list[0]
        tweet_date = doc_as_list[1]
        full_text = doc_as_list[2]
        url = doc_as_list[3]
        retweet_text = doc_as_list[4]
        retweet_url = doc_as_list[6]
        quote_text = doc_as_list[8]
        quote_url = doc_as_list[9]
        term_dict = {}
        urls = json.loads(url)
        tokenized_text = self.parse_sentence(full_text, urls)
        parsed_text = [
            tok for tok in tokenized_text
            if tok not in self.stop_words + self.punctuators
        ]

        doc_length = len(
            parsed_text
        )  # after text operations. TODO: check if before parsing gives better results
        self.total_doc_length += doc_length

        if retweet_url is not None:
            # print(retweet_url)
            tid_ptrn = re.compile(r'\d{7,}')
            # for url in retweet_url.values():
            s = tid_ptrn.search(retweet_url)
            if s is not None:
                tid = retweet_url[s.start():s.end()]
                if tid not in self.retweet_dict:
                    self.retweet_dict[tid] = 1
                else:
                    self.retweet_dict[tid] += 1

        for term in parsed_text:
            if term not in term_dict.keys():
                if term[:1].isupper():
                    term_dict[term.upper()] = 1
                else:
                    term_dict[term.lower()] = 1
            else:
                if term[:1].isupper():
                    term_dict[term.upper()] += 1
                else:
                    term_dict[term.lower()] += 1

        for term in [key for key in term_dict.keys() if key.islower()]:
            if term.upper() in term_dict.keys():
                term_dict[term] += term_dict.pop(term.upper())

        # if self.num_of_docs % self.group_size == 0:
        #     self.write_file()

        document = Document(tweet_id, tweet_date, full_text, url, retweet_text,
                            retweet_url, quote_text, quote_url, term_dict,
                            doc_length)
        return document

    def check_capital(self, token):
        if len(token.split()) > 1:
            for word in token.split():
                if word.lower() not in self.caps_dict.keys():
                    self.caps_dict[word.lower()] = word[0].islower()
                else:
                    if word[0].islower():
                        self.caps_dict[word.lower()] = True
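
    # A minimal usage sketch (added, assumes the nltk 'words' and 'stopwords'
    # corpora are available and that tweet_row is a list of tweet fields in the
    # layout expected by parse_doc):
    #   parser = Parse(config=None, advanced=True)
    #   doc = parser.parse_doc(tweet_row)
    #   print(doc.term_doc_dictionary, doc.doc_length)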
Example #10
class Parse:
    def __init__(self, stemming=None):
        """
        constructor for this class
        :param stemming:
        """
        self.stop_words = stopwords.words('english')
        self.stemmer = None
        if stemming:
            self.stemmer = Stemmer()
        self.corona_list = [
            "SARS", "sars", "Severe Acute Respiratory Syndrome",
            "severe acute respiratory syndrome", "SARS-CoV", "SARS CoV",
            "sars-cov", "sars cov", "coronavirus", "corona virus", "COVID",
            "covid", "Covid", "COVID-19", "covid-19", "#coronavirus",
            "COVID__19", "#COVID", "#COVID-19", "#covid19", "#SARS"
        ]

    def get_list_without_stopwords(self, list):
        """

        :param list:
        :return: list without stopwords
        """
        list_without_stopwords = []
        stop_words = stopwords.words('english')
        for w in list:
            if not w.lower() in stop_words:
                list_without_stopwords.append(w)
        return list_without_stopwords

    def check_If_Upper_More_Then_Lower(self, text):
        """
        This function checks the ratio of lower- to upper-case characters in a string
        :param text:
        :return: true or false
        """
        if len(text) > 0:
            count = 0
            i = 0
            while i < len(text):
                if text[i].islower():
                    count = count + 1
                i = i + 1
        len1 = len(text)
        if len1 > 0:
            return count / len(text) < 0.5
        else:
            return False

    def upperToLowerAfterDot(self, list, index, new_tokens):
        """
        Converts a word that appears after a dot or ':' in the text to lower case
        :param list:
        :param index:
        :param new_tokens:
        :return:
        """
        if len(list) > index + 1:  # term term . &
            if len(list) > index + 2:
                if list[index + 1].isalpha() and not list[index + 2].isupper():
                    new_tokens.append(list[index + 1].lower())
                    list[index + 1] = ""

    def Hashtags(self, list, index, new_tokens):
        """
        This function get "@" and concat this term to the next term
        :param list:
        :param index:
        :param new_tokens:
        :return:
        """
        if len(list) >= index + 1:
            word = list[index + 1]
            list[index + 1] = ""
            if "_" in word:
                words = word.rsplit("_")
            else:
                word = re.sub('([a-zA-Z])', lambda x: x.groups()[0].upper(),
                              word, 1)
                words = re.findall('[A-Z][^A-Z]*', word)
            new_word = ""
            i = 0
            while i < len(words):
                new_tokens.append(words[i].lower())
                new_word = new_word + words[i].lower()
                i += 1
            new_tokens.append("#" + new_word)

    def tags(self, list, index, new_tokens):
        """
        This function handles "@": adds the "@" handle as one term and the plain lowercased term as another
        :param list:
        :param index:
        :param new_tokens:
        :return:
        """
        new_word = "@" + list[index + 1]
        new_tokens.append(new_word)
        new_tokens.append(list[index + 1].lower())
        list[index + 1] = ''

    def extractUrl(self, list, index):
        """
        This function separates the url into terms
        :param list:
        :param index:
        :return:
        """
        word = list[index]
        tokenize_list_url = re.compile(r'[\:/?=\-&]+', re.UNICODE).split(word)
        if len(tokenize_list_url) > 1:
            url = tokenize_list_url[1]
            if 'www.' in url:
                url2 = url.replace('www.', '')
                tokenize_list_url.append(url2)
        list.extend(tokenize_list_url)

    def handel_percent(self, list, index, new_tokens):
        """
        This function convert "percentage" or "percent" to % and
        concat the term which appears before the %
        :param list:
        :param index:
        :param new_tokens:
        :return:
        """
        if not list[index - 1].isalpha():
            num = list[index - 1]
            new_word = num + "%"
            if index - 1 < len(list):
                if list[index - 1] in new_tokens:
                    new_tokens.remove(list[index - 1])
            new_tokens.append(new_word)

    def convertNumbersUnits(self, list, index, new_tokens):
        """
        This function converts number units (thousand, million, ...) into a compact form
        :param list:
        :param index:
        :param new_tokens:
        :return:
        """
        numeric_list = WordsToNumber().getNumericWords()
        if index + 1 < len(list) and list[index + 1].lower() in numeric_list:
            num = float(list[index])
            numericNum = float(WordsToNumber().execute(list[index + 1]))
            new_Num = str(num * numericNum)
            new_word = WordsToNumber().handle_number(new_Num)
            list[index] = ''
            list[index + 1] = ''
            new_tokens.append(new_word)
        elif float(list[index]) >= 1000:
            new_word = WordsToNumber().handle_number(str(list[index]))
            list[index] = ''
            new_tokens.append(new_word)
        elif self.isFraction(list, index + 1):
            if "." not in list[index]:
                new_word = list[index] + " " + list[index + 1]
                list[index + 1] = ''
            else:
                new_word = list[index]
            new_tokens.append(new_word)
        else:
            new_tokens.append(list[index])

    def combainCapitalTerms(self, text_tokens):
        """
        This function concatenates two or more consecutive capitalized terms into a single term
        :param text_tokens:
        :return:
        """
        for index, word in enumerate(text_tokens):
            if len(word) > 0:
                if word[0].isupper():
                    try:
                        list_ca = self.capitalettersTerms(text_tokens, index)
                        text_tokens = text_tokens + list_ca
                    except:
                        print("Could not connect terms")
            if index == 3:
                break
        return text_tokens

    def capitalettersTerms(self, list, index):
        result = []
        i = 0
        word = list[index]
        if word[0].isupper():
            new_word = word
            i = index
            if i + 1 < len(list):
                i = i + 1
                loop = 1
                while list[i][0].isupper() and index + 1 == i and loop < 5:
                    loop += 1
                    new_word = new_word + " " + list[i]
                    index += 1
                    if i + 1 < len(list):
                        i += 1
                if not new_word in list:
                    result.insert(index, new_word)
            else:
                if list[index][0].isupper() and not new_word in list:
                    result.insert(index, list[index])
        else:
            i += 1
        return result

    def remove_emoji(self, string):
        """
        This function removes emojis from the text
        :param string:
        :return:
        """
        emoji_pattern = re.compile(
            "["
            u"\U0001F600-\U0001F64F"  # emoticons
            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
            u"\U0001F680-\U0001F6FF"  # transport & map symbols
            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
            u"\U00002500-\U00002BEF"  # chinese char
            u"\U00002702-\U000027B0"
            u"\U00002702-\U000027B0"
            u"\U000024C2-\U0001F251"
            u"\U0001f926-\U0001f937"
            u"\U00010000-\U0010ffff"
            u"\u2640-\u2642"
            u"\u2600-\u2B55"
            u"\u200d"
            u"\u23cf"
            u"\u23e9"
            u"\u231a"
            u"\ufe0f"  # dingbats
            u"\u3030"
            "]+",
            flags=re.UNICODE)
        return emoji_pattern.sub(r'', string)
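
    # Illustrative sketch (added): the pattern strips pictographic characters,
    # e.g. remove_emoji("stay safe 😷") -> "stay safe " (only the emoji is removed).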

    def isFraction(self, list, index):
        """
        This function checks whether the word is a fraction or not
        :param list:
        :param index:
        :return:
        """
        word = list[index]
        if "/" in word:
            word = word.replace("/", "")
            if word.isnumeric():
                return True
            else:
                return False
        elif "." in word:
            word = word.replace(".", "")
            if word.isnumeric():
                return True
            else:
                return False

    def isNumber(self, list, index):
        """
        This function checks whether the word is a number or not
        :param list:
        :param index:
        :return:
        """
        word = list[index]
        if "," in word:
            word = word.replace(",", "")
            if word.isnumeric():
                list[index] = word
                return True
            else:
                return False
        elif "." in word and word.count(".") == 1:
            word = word.replace(".", "")
            if word.isnumeric():
                return True
        else:
            return str(list[index]).isnumeric()

    def handle_dashes(self, list, index, new_tokens):
        """
        This function separates the term at the "-"
        :param list:
        :param index:
        :param new_tokens:
        :return:
        """
        dash_idx = list[index].find('-')
        if self.stemmer is None:
            new_tokens.append(list[index].lower())
            new_tokens.append(list[index][:dash_idx].lower())
            new_tokens.append(list[index][dash_idx + 1:].lower())
        else:
            new_tokens.append(self.stemmer.stem_term(list[index].lower()))
            new_tokens.append(
                self.stemmer.stem_term(list[index][:dash_idx].lower()))
            new_tokens.append(
                self.stemmer.stem_term(list[index][dash_idx + 1:].lower()))
        if list[index] in self.corona_list:
            new_tokens.append("corona")

    def parse_sentence(self, text):
        """
        This function tokenizes, removes stop words and applies lower case to every word within the text
        :param text:
        :return:
        """
        new_capital_words = set()
        temp_entitie = ''
        will_merge = 0
        capital_dict = {}
        entities_set = set()

        #text = self.remove_emoji(text)     ****************************************
        #if self.check_If_Upper_More_Then_Lower(text): ************************
        #    text = text.lower()                      ******************************

        text_tokens = word_tokenize(text)
        try:
            url = ""
            if "http" in text:
                url = re.search("(?P<url>https?://[^\s]+)", text).group("url")
                if len(url) > 0:
                    text = text.replace(url, "")
                    text_tokens = word_tokenize(text)
        except:
            pass

        #text_tokens = self.get_list_without_stopwords(text_tokens) *******************************
        new_tokens = []
        # text_tokens_without_stopwords = [w.lower() for w in text_tokens if w not in self.stop_words]
        for index, word in enumerate(text_tokens):
            if word == "" or word == " " or word.lower(
            ) in self.stop_words or word.lower().endswith("'s") or (
                    len(word) == 1 and ord(word)) > 126:
                continue
            # ------------------------------------------------------------------------ upper to lower
            elif word == "." or word == ":":
                self.upperToLowerAfterDot(text_tokens, index, new_tokens)
            #  -------------------------------------------------------------------------- HashTAG
            elif word == "#" and index <= len(text_tokens) - 2:
                self.Hashtags(text_tokens, index, new_tokens)
            #   ----------------------------------------------------------------------------  Tags
            elif word == "@" and index <= len(text_tokens) - 2:
                self.tags(text_tokens, index, new_tokens)
            #   ------------------------------------------------------------------------  percent %
            elif word == "percent" or word == "percentage" or word == '%':
                self.handel_percent(text_tokens, index, new_tokens)
            #   -------------------------------------------------------------------------- Dollars $ "the number is 80 $ and nata $"
            elif word == "$":
                new_tokens.append("dollars")
            #   ------------------------------------------------------------------------- numbers, e.g. 3 million
            elif not word.isalpha():
                if self.isNumber(text_tokens, index) or word.isnumeric():
                    try:
                        self.convertNumbersUnits(text_tokens, index,
                                                 new_tokens)
                    except:
                        pass
                # ---------------------------------------------------------------- split the word by the dashes
                elif '-' in word and len(word) > 1:
                    self.handle_dashes(text_tokens, index, new_tokens)
            # -------------------------------------------------------------
            elif word in self.corona_list:
                new_tokens.extend([word, "corona"])
            # ------------------------------------------------- Otherwise, if it's just a normal word add it
            elif word.isalpha() or word.isnumeric():
                if self.stemmer is not None:
                    word = self.stemmer.stem_term(word)
                new_tokens.append(word)
            # ------------------------------------------------- chaining two or more capitalized words into one term
            if len(word) > 0 and word[0].isupper():
                # chunks entities together.
                temp_entitie += word + " "
                will_merge += 1
            else:
                # add entity to the global counter and to the current words set
                if temp_entitie != '':
                    n = temp_entitie[:
                                     -1]  # delete the space " " after the capital term
                    entities_set.add(n)
                    if will_merge > 1:
                        new_capital_words.add(temp_entitie)
                    temp_entitie = ''
                    will_merge = 0

            if len(word) > 0 and word[0].isupper():
                if word not in capital_dict:
                    capital_dict[word.lower()] = True
            else:
                capital_dict[word.lower()] = False

        if len(url) > 0:
            list = []
            list.append(url)
            self.extractUrl(list, 0)
            new_tokens.extend(list)

        return new_tokens, capital_dict, entities_set
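
    # Illustrative sketch (added): parse_sentence returns three values - the
    # token list, a per-word capitalization dict and the set of capitalized
    # entities, e.g.:
    #   tokens, capital_dict, entities = parser.parse_sentence(full_text)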

    def parse_doc(self, doc_as_list):
        """
        This function takes a tweet document as a list and breaks it into different fields
        :param doc_as_list: list representing the tweet.
        :return: Document object with corresponding fields.
        """
        tweet_id = doc_as_list[0]
        tweet_date = doc_as_list[1]
        full_text = doc_as_list[2]
        url = doc_as_list[3]
        retweet_text = doc_as_list[5]
        retweet_url = doc_as_list[6]
        quote_text = doc_as_list[8]
        quote_url = doc_as_list[9]
        retweet_quoted_text = doc_as_list[11]

        if quote_text is not None:
            full_text = full_text + " " + quote_text
        if retweet_quoted_text is not None:
            full_text = full_text + " " + retweet_quoted_text
        #if retweet_text is not None:
        #    full_text = full_text + " " + retweet_text

        # clean latin letters
        full_text = re.sub(
            re.compile(
                pattern=
                r'[^\x00-\x7F\x80-\xFF\u0100-\u017F\u0180-\u024F\u1E00-\u1EFF\u2019]'
            ), u'', full_text)
        term_dict = {}
        tokenized_text, capital_dict, entities_set = self.parse_sentence(
            full_text)

        doc_length = len(tokenized_text)  # after text operations.

        max_tf = 0
        for idx, term in enumerate(tokenized_text):
            if term not in term_dict.keys():
                term_dict[term] = [idx]
            else:
                term_dict[term].append(idx)
                max_tf = max(len(term_dict[term]), max_tf)

        unique_terms_in_doc = len(term_dict)
        are_rt = 0
        if full_text.find("rt") == 0 or full_text.find("RT") == 0:
            are_rt = 1

        document = Document(tweet_id, tweet_date, full_text, url, retweet_text,
                            retweet_url, quote_text, quote_url, term_dict,
                            doc_length, max_tf, unique_terms_in_doc, are_rt,
                            capital_dict, entities_set)
        return document
Example #11
    def parser_rules(self, token_text, stemming=False):
        rmv = []
        add = []
        url_stop = [
            "/", "\\", "-", "=", '%', "'", " ", ":", "`", '``', '_', '"',
            "...", '``', "''", "www."
        ]
        delimiters = '|'.join(map(re.escape, url_stop))
        all_delimiters = '|'.join(map(re.escape, url_stop + ["."]))
        nameOrEntity = ""
        counterOfCapitalInARow = 0

        for i, token in enumerate(token_text):

            if token in self.stop_words or token.lower(
            ) in self.stop_words or token in url_stop:
                rmv.append(token)
                continue
            # Check for unwanted chars like : . ; , / etc
            if len(token) == 1 and token not in ["@", "#", "$", "%"]:
                # if ord(token_text[i]) > 122 or 90 < ord(token_text[i]) < 97 or 57 < ord(token_text[i]) < 64 or 37 < ord(token_text[i]) < 48 or 31 < ord(token_text[i]) < 35:
                rmv.append(token_text[i])
                continue
            # Remove unwanted expressions
            if token.__contains__("t.co") or token.__contains__(
                    "http") or token.lower() == "rt" or token.__contains__(
                        "twitter.com"):
                rmv.append(token)
                continue
            # url detector
            if token.__contains__("//"):
                token_url = [
                    t for t in re.split(delimiters, token) if (len(t) > 1)
                ]

                rmv.append(token)
                add += token_url
                continue

            # Check if it is a tag
            if token_text[i] == "@" and i < len(token_text) - 1:
                token_text[i] = token_text[i] + token_text[i + 1]
                rmv.append(token_text[i + 1])
                continue
            # Check if it is a hashtag and analyze the hashtag to words according to Upper letters
            if token_text[i] == "#" and i < len(token_text) - 1:
                token_text[i] = token_text[i] + token_text[i + 1]
                rmv.append(token_text[i + 1])
                add = self.word_cutter(add, url_stop, token_text[i + 1])
                continue
            # Turn every context of dollars to the word dollar
            if token.lower() in ["$", "dollars"]:
                token_text[i] = "dollar"
                continue

            # Turn every context of percentage to %
            if self.is_real_number(token_text[i]) and i < len(token_text) - 1:
                if token_text[i + 1].lower() in ["%", "percentage", "percent"]:
                    token_text[i] = token_text[i] + "%"
                    rmv.append(token_text[i + 1])
                    continue

            # Names and Entities - will be 2 or 3 tokens
            if token_text[i][0].isupper(
            ) and counterOfCapitalInARow < 3 and not token_text[i].isnumeric():
                nameOrEntity = nameOrEntity + " " + token_text[i]
                # delete space in the beginning
                if counterOfCapitalInARow == 0:
                    nameOrEntity = nameOrEntity[1:len(nameOrEntity)]
                counterOfCapitalInARow += 1
            elif 1 < counterOfCapitalInARow < 4:  # add to the right set - number of times that the entity exists so far

                add.append(nameOrEntity.upper())
                nameOrEntity = ""
                counterOfCapitalInARow = 0
            else:
                nameOrEntity = ""
                counterOfCapitalInARow = 0

            # Check if it is a big number
            if self.is_real_number_comma(token_text[i]):
                try:
                    # Convert to float and int
                    convertedNumToFloat = float(token_text[i].replace(',', ''))
                    convertedToInt = int(convertedNumToFloat)
                    # The final number
                    if convertedToInt == convertedNumToFloat:
                        finalNumber = convertedToInt
                    else:
                        finalNumber = convertedNumToFloat

                    # Check if the next token is thousand, million, billion or fraction
                    if finalNumber < 1000:
                        if i < len(token_text) - 1 and token_text[i + 1] in [
                                "Thousand", "thousand", "Thousands",
                                "thousands"
                        ]:
                            convertedToString = str(finalNumber) + "K"

                        elif i < len(token_text) - 1 and token_text[i + 1] in [
                                "Million", "million", "Millions", "millions"
                        ]:
                            convertedToString = str(finalNumber) + "M"

                        elif i < len(token_text) - 1 and token_text[i + 1] in [
                                "Billion", "billion", "Billions", "billions"
                        ]:
                            convertedToString = str(finalNumber) + "B"

                        # if the next token is fraction then connect them
                        elif i + 1 < len(token_text) and self.is_fraction(
                                token_text[i + 1]):
                            convertedToString = token_text[
                                i] + " " + token_text[i + 1]
                        else:
                            continue

                        # Add to lists
                        add.append(convertedToString)
                        rmv.append(token_text[i])
                        rmv.append(token_text[i + 1])

                    # if it is a thousand number
                    elif 999 < convertedToInt < 999999:
                        finalNumber /= 1000

                        # After division need to save again 1 or 1.0 for example
                        convertedNumToFloat = float(finalNumber)
                        convertedToInt = int(convertedNumToFloat)
                        if convertedToInt == convertedNumToFloat:
                            finalNumber = convertedToInt
                        else:
                            finalNumber = convertedNumToFloat
                            finalNumber = self.round_down(finalNumber)

                        convertedToString = str(finalNumber) + "K"

                        # Add to lists
                        add.append(convertedToString)
                        rmv.append(token_text[i])

                    # if it is a Million number
                    elif 999999 < convertedToInt <= 999999999:
                        finalNumber /= 1000000

                        # After division need to save again 1 or 1.0 for example
                        convertedNumToFloat = float(finalNumber)
                        convertedToInt = int(convertedNumToFloat)
                        if convertedToInt == convertedNumToFloat:
                            finalNumber = convertedToInt
                        else:
                            finalNumber = convertedNumToFloat
                            finalNumber = self.round_down(finalNumber)

                        convertedToString = str(finalNumber) + "M"

                        # Add to lists
                        add.append(convertedToString)
                        rmv.append(token_text[i])

                    # if it is a Billion number
                    elif 9999999 < convertedToInt:
                        finalNumber /= 1000000000

                        # After division need to save again 1 or 1.0 for example
                        convertedNumToFloat = float(finalNumber)
                        convertedToInt = int(convertedNumToFloat)
                        if convertedToInt == convertedNumToFloat:
                            finalNumber = convertedToInt
                        else:
                            finalNumber = convertedNumToFloat
                            finalNumber = self.round_down(finalNumber)

                        convertedToString = str(finalNumber) + "B"
                        # Add to lists
                        add.append(convertedToString)
                        rmv.append(token_text[i])
                except:
                    continue

            # Split words that will mean something after splitting
            if any(one_char in url_stop + ["."] for one_char in token):
                # print(token_text[i])

                token_url = [
                    t for t in re.split(all_delimiters, token) if (len(t) > 1)
                ]
                rmv.append(token)
                add += token_url
                continue

        for w in rmv:
            if w in token_text:
                token_text.remove(w)
        for w2 in add:
            if w2 == "" or w2 in url_stop:
                continue
            token_text.append(w2)
        # Stem if asked
        if stemming:
            s = Stemmer()
            for i, token in enumerate(token_text):
                if self.first_alfa_upper(token):
                    token_text[i] = s.stem_term(token).upper()
                else:
                    token_text[i] = s.stem_term(token)

        return token_text
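
    # A minimal usage sketch (added, assumed context): parser_rules expects an
    # already-tokenized tweet and mutates/returns the token list, e.g.
    #   tokens = parser.parser_rules(word_tokenize(tweet_text), stemming=False)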
Example #12
class Parse:
    def __init__(self, stemming=0):
        """
         This function initiates the fields of Parse, initializes the stemmer and adds the stop words
         :param stemming: boolean value indicating whether stemming is needed (optional)
         """
        self.stemming = stemming
        self.stemmer = Stemmer()

        # self.stop_words = frozenset(stopwords.words('english')) ??????????????????????????????????????????????????????
        self.stop_words = stopwords.words('english')
        self.stop_words.extend([
            ':', '\'s', '.', ',', ';', '’', '?', '!', 'rt', '-', '|', '~', '(',
            ')', '*', '+', '=',
            '/', '"', '``', '\'\'', '\n', '\n\n', '&', 'amp', '…', '\'', '`',
            '[', ']', '{', '}'
        ])

    def find_url(self, text):
        """
        This function finds the url addresses in the text (matching a valid url pattern)
        :param text: the full text of the tweet
        :return: list of all urls in the text
        """
        text = re.sub(r'[^\x00-\x7F]+', '', text)
        urls = re.findall(
            'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
            text)
        return urls

    def tokenize_urls(self, urls):
        """
        This function tokenizes the url addresses in the text
        :param urls: the list of urls in the text
        :return: list of tokenized words from all urls
        """
        url_tokens = []
        tokens_ans = []
        for url in urls:
            if 't.co' not in url:
                url_tokens.extend(re.split(r';|-|/|//|:|=|\?', url))

        for token in url_tokens:
            if token == 'https' or token == 'http' or token == '':
                continue
            elif 'www.' in token:
                tokens_ans.append(token.replace('www.', ''))
            # remove garbage (like aH3cdR5ouY)
            elif token.islower() or token.isdigit():
                tokens_ans.append(token)
        return tokens_ans

    def number_3digits(self, number):
        """
        This function formats the number to at most 4 significant digits
        :param number: the number is int/float
        :return: the number as a string with at most 4 significant digits
        """
        return "{:.4g}".format(number)

    def number_size(self, w, i, text_tokens_list):
        """
        This function checks which magnitude range the number is in and formats it accordingly
        :param w: the word is string
        :param i: the index is integer
        :param text_tokens_list: this is a list of tokens
        :return: the formatted word
        """
        number = int(w) if w.isdigit() else float(w)
        # Number is in thousand range
        if 1000 <= number < 1000000:
            number = number / 1000
            w = self.number_3digits(number) + "K"
        # Number is in million range
        elif 1000000 <= number < 1000000000:
            number = number / 1000000
            w = self.number_3digits(number) + "M"
        # Number is in billion range or more
        elif 1000000000 <= number:
            number = number / 1000000000
            w = self.number_3digits(number) + "B"
        # Number is in hundred range or less
        else:
            w = self.number_3digits(number)

            # Thousand
            if i + 1 < len(text_tokens_list) and (
                    text_tokens_list[i + 1] == 'Thousand'
                    or text_tokens_list[i + 1] == 'thousand'):
                text_tokens_list[i + 1] = 'K'
                text_tokens_list[i:(i + 2)] = [
                    ''.join(text_tokens_list[i:(i + 2)])
                ]
                w = text_tokens_list[i]

            # Million
            elif i + 1 < len(text_tokens_list) and (
                    text_tokens_list[i + 1] == 'Million'
                    or text_tokens_list[i + 1] == 'million'):
                text_tokens_list[i + 1] = 'M'
                text_tokens_list[i:(i + 2)] = [
                    ''.join(text_tokens_list[i:(i + 2)])
                ]
                w = text_tokens_list[i]

            # Billion
            elif i + 1 < len(text_tokens_list) and (
                    text_tokens_list[i + 1] == 'Billion'
                    or text_tokens_list[i + 1] == 'billion'):
                text_tokens_list[i + 1] = 'B'
                text_tokens_list[i:(i + 2)] = [
                    ''.join(text_tokens_list[i:(i + 2)])
                ]
                w = text_tokens_list[i]

            # Fraction after the number
            elif i + 1 < len(text_tokens_list) and bool(
                    re.search(r'^-?[0-9]+\/[0-9]+$', text_tokens_list[i + 1])):
                text_tokens_list[i:(i + 2)] = [
                    ' '.join(text_tokens_list[i:(i + 2)])
                ]
                w = text_tokens_list[i]

        return w

    def get_entity(self, text):
        """
        This function finds the entities in the text (two or more consecutive words that start with a capital letter)
        :param text: the full text of the tweet
        :return: list of all entities in the text
        """
        entities = re.findall(
            r'^[A-Z][a-z]+(?: [A-Z][a-z]+)+| [A-Z][a-z]+(?: [A-Z][a-z]+)+',
            text)
        for i, entity in enumerate(entities):
            entities[i] = entity.upper()
            if entity[0] == ' ':
                entities[i] = entities[i][1:]
        return entities
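
    # Illustrative sketch (added):
    #   get_entity("Donald Trump met Boris Johnson")
    #   -> ['DONALD TRUMP', 'BORIS JOHNSON']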

    def parse_sentence(self, text):
        """
        This function tokenizes, removes stop words and applies parser rules to every word within the text
        :param text: the full text of the tweet
        :return: list of tokenized words
        """
        full_text = text
        # Extract the urls from the text and tokenize them separately
        urls = self.find_url(text)
        tokenized_urls = []
        if len(urls) != 0:
            tokenized_urls = self.tokenize_urls(urls)
        for url in urls:
            text = text.replace(url, '')

        # Tokenize the text- remove all characters that not ascii,
        # then split the words in the text by punctuation marks,
        # and finally clear all white spaces
        text = re.sub(r'[^\x00-\x7F]+', ',', text)
        text_tokens = re.split(
            r'([^a-zA-Z0-9_]|[0-9]*/[0-9]*|[0-9]*,[0-9]*,[0-9]*,[0-9]*|[0-9]*,[0-9]*,[0-9]*|[0-9]*,[0-9]*)',
            text)  # \W
        text_tokens = list(filter((' ').__ne__, text_tokens))
        text_tokens = list(filter(('').__ne__, text_tokens))

        # Loops on the tokens list
        i = 0
        while i < len(text_tokens):
            w = text_tokens[i]
            # If it is a stop word - delete it
            if (w.lower() in self.stop_words) or (w in self.stop_words):
                del text_tokens[i]
                continue
            else:
                # Find parser rules
                # (Upper case) - if the first letter is capital -> the whole word is uppercased
                if len(w) > 1 and w[0].isupper():
                    text_tokens[i] = w = w.upper()
                # (@) - if the word is @ and after there is a word -> union those tokens
                elif w == '@' and i < (len(text_tokens) - 1):
                    text_tokens[i:(i + 2)] = [''.join(text_tokens[i:(i + 2)])]
                # (#) - if the word is # and after there is a word -> union those tokens (there are more rules here)
                elif w == '#' and i < (len(text_tokens) - 1) and (
                        text_tokens[i + 1] == ','
                        or text_tokens[i + 1] == '#'):
                    del text_tokens[i]
                    del text_tokens[i]
                    continue
                elif w == '#' and i < (len(text_tokens) -
                                       1) and text_tokens[i + 1] != ',':
                    hashword = text_tokens[i + 1]
                    text_tokens[i:(i + 2)] = [
                        ''.join(text_tokens[i:(i + 2)]).lower().replace(
                            '_', '')
                    ]
                    separate = hashword.split('_')
                    # in case the words are not separated by _ (like: #home)
                    if len(separate) == 1:
                        # in case the hashtag is all lower case
                        if separate[0].islower():
                            text_tokens.insert(i, hashword)
                            continue

                        separate = re.findall('[A-Z][^A-Z]*', separate[0])

                    # new rule: hashtag with sequenced capital letter will be merged to one term (like: #WhereIsKCR)
                    for index, word in enumerate(separate):
                        if len(word) == 1 and word.isupper():
                            j = index + 1
                            while j < len(separate) and len(separate[j]) == 1:
                                j += 1
                            separate[index:(j + 1)] = [
                                ''.join(separate[index:(j + 1)])
                            ]

                    # Add the separated words from the hashtag to the tokens list
                    for word in reversed(separate):
                        if len(word) > 0:
                            text_tokens.insert(i, word.lower())

                # Numbers
                elif w.isdigit() or w.replace(',', '').isdigit():
                    # Remove ,
                    text_tokens[i] = w = w.replace(',', '')

                    # .
                    if (i + 1) < len(text_tokens) and text_tokens[
                            i + 1] == '.' and (i + 2) < len(
                                text_tokens) and text_tokens[i + 2].isdigit():
                        text_tokens[i:(i +
                                       3)] = [''.join(text_tokens[i:(i + 3)])]
                        w = text_tokens[i]

                    # Number%
                    if (i + 1) < len(text_tokens) and text_tokens[i +
                                                                  1] == '%':
                        text_tokens[i] = self.number_3digits(
                            float(text_tokens[i]))
                        text_tokens[i:(i +
                                       2)] = [''.join(text_tokens[i:(i + 2)])]
                        i += 1
                        continue

                    # Number percent/percentage -> Number%
                    elif (i + 1) < len(text_tokens) and \
                            (text_tokens[i + 1] == 'percent' or text_tokens[i + 1] == 'percentage'):
                        text_tokens[i] = self.number_3digits(
                            float(text_tokens[i]))
                        text_tokens[i + 1] = '%'
                        text_tokens[i:(i +
                                       2)] = [''.join(text_tokens[i:(i + 2)])]
                        i += 1
                        continue

                    # Other numbers- check ranges
                    text_tokens[i] = w = self.number_size(w, i, text_tokens)

                    # new rule: $Number will be merged to one term
                    if i > 0 and text_tokens[i - 1] == '$':
                        text_tokens[(i - 1):(i + 1)] = [
                            ''.join(text_tokens[(i - 1):(i + 1)])
                        ]
                        continue
            i += 1

        # stem terms if needed
        if self.stemming:
            for j, term in enumerate(text_tokens):
                if text_tokens[j][0] != '#' and text_tokens[j][0] != '@':
                    text_tokens[j] = self.stemmer.stem_term(term)
        text_tokens += tokenized_urls
        return text_tokens

    # can't change the function signature
    def parse_doc(self, doc_as_list):
        """
        This function takes a tweet document as a list and breaks it into different fields
        :param doc_as_list: list representing the tweet.
        :return: Document object with corresponding fields.
        """
        tweet_id = doc_as_list[0]
        tweet_date = doc_as_list[1]
        full_text = doc_as_list[2]
        url = doc_as_list[3]
        retweet_text = doc_as_list[5]
        retweet_url = doc_as_list[6]
        quote_text = doc_as_list[8]
        quote_url = doc_as_list[9]

        entity_list = dict(Counter(self.get_entity(full_text)))

        # Change the short urls in the text to the full urls (if they exist in the url dictionary), and send to parse_sentence
        j = json.loads(url)
        text = full_text
        for short in j:
            if j[short] is not None:
                text = text.replace(short, j[short])
        tokenized_text = self.parse_sentence(text)
        tokenized_text = list(filter(('').__ne__, tokenized_text))

        doc_length = len(tokenized_text)  # after text operations.

        term_dict = dict(Counter(tokenized_text))

        document = Document(tweet_id, tweet_date, full_text, url, retweet_text,
                            retweet_url, quote_text, quote_url, term_dict,
                            doc_length, entity_list)
        return document

    def parse_query(self, query):
        list_tokens = self.get_entity(query)
        list_tokens += self.parse_sentence(query)
        dict_tokens = dict(Counter(list_tokens))
        return dict_tokens
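
    # A minimal usage sketch (added, assumes the nltk stopwords corpus is
    # available):
    #   p = Parse(stemming=0)
    #   term_dict = p.parse_query("Donald Trump won 100 percent of the vote")
    #   # term_dict maps each parsed term (and detected entity) to its count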
Example #13
class Parse:
    def __init__(self, stemming):
        self.stop_words = stopwords.words('english')
        self.stop_words += ["rt", "http", "https", "www",
                            "twitter.com"]  # TODO: check &amp
        self.terms = set()
        self.nonstopwords = 0
        self.max_tf = 0
        self.toStem = stemming
        self.entities = {}
        if self.toStem:
            self.stemmer = Stemmer()

    def parse_sentence(self, text):
        """
        This function tokenize, remove stop words and apply lower case for every word within the text
        :param text:
        :return:
        """
        term_dict = {}
        entity_dict = {}
        # Entity recognition by capital letters (2 words or more)
        for entity in re.findall(ENTITY_PATTERN, text):
            cleaned_entity = re.sub("-", " ", entity).upper()
            entity_dict[cleaned_entity] = entity_dict.get(cleaned_entity,
                                                          0) + 1

        text_tokens = re.findall(TOKENIZER_PATTERN, text)
        indices_counter = 0
        for term in text_tokens:
            if len(term) < 1: continue
            indices_counter += 1
            if term[0] == "#":  # handle hashtags
                hashtag_list = self.hashtag_parser(term)
                for mini_term in hashtag_list:
                    self.dictAppender(term_dict, indices_counter, mini_term)
            elif term[0] == "@":  # handle tags
                no_tag = self.tags_parser(term)
                self.dictAppender(term_dict, indices_counter, no_tag)
            elif term in contractions:  # remove things like he'll
                new_terms = contractions[term].split(" ")
                for mini_term in new_terms:
                    self.dictAppender(term_dict, indices_counter, mini_term)
                    indices_counter += 1
                indices_counter -= 1
                continue
            self.dictAppender(term_dict, indices_counter, term)

        return term_dict, indices_counter, entity_dict

    def split_url(self, url):
        url_list = list(filter(None, re.split(SPLIT_URL_PATTERN, url)))
        return url_list

    def remove_percent_dollar(self, text):
        no_dollar = re.sub(DOLLAR_PATTERN, "$", text)
        return re.sub(PERCENT_PATTERN, "%", no_dollar)

    def num_manipulation(self, num):
        num = re.sub(BILLION_PATTERN, "B", num)
        num = re.sub(MILLION_PATTERN, "M", num)
        num = re.sub(THOUSAND_PATTERN, "K", num)
        num = re.sub(BILLION_PATTERN_NUM, r'\1.\3B', num)
        num = re.sub(MILLION_PATTERN_NUM, r'\1.\3M', num)
        num = re.sub(THOUSAND_PATTERN_NUM, r'\1.\3K', num)
        num = re.sub(GENERAL_PATTERN, r'\1.\2\3\5', num)
        return re.sub(DECIMAL_PATTERN, r'\1\3', num)
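
    # Note (added): BILLION_PATTERN, MILLION_PATTERN, THOUSAND_PATTERN and the
    # other *_PATTERN names used above are assumed to be module-level regex
    # constants defined elsewhere in this project; they rewrite spelled-out and
    # decimal magnitudes into the compact K/M/B notation before tokenization.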

    def url_parser(self, url):
        """
        :param url: receives a string-based dictionary of all urls
        :return: list of parsed url tokens
        """
        if len(url) <= 2:  # url dict is empty (e.g. "{}")
            return []
        url_list = re.findall(
            'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
            url[1:-1])

        finalList = []
        for val in url_list:
            if 'twitter.com/i/web/status/' in val or 't.co' in val:
                continue
            val = re.sub(TWITTER_STATUS_PATTERN, r'\2', val)
            finalList = self.split_url(val)
        return finalList

    def hashtag_parser(self, hashtag):
        splitted_hashtag = list(
            map(
                lambda x: x.lower(),
                filter(lambda x: len(x) > 0, re.split(HASHTAG_PATTERN,
                                                      hashtag))))
        if len(splitted_hashtag) < 2:
            return splitted_hashtag
        else:
            return splitted_hashtag[1:] + [hashtag.lower()]

    def tags_parser(self, tag):
        return tag[1:]

    def dictAppender(self, d, counter, term):
        # Handling Stemming
        if self.toStem:
            stemmed_word = self.stemmer.stem_term(term)
            if not term.islower():
                term = stemmed_word.upper()
            else:
                term = stemmed_word

        # Handling upper & lower cases per document
        term_lower = term.lower()
        if not all(ord(c) < 128 for c in term): return
        if term_lower in self.stop_words: return
        term_upper = term.upper()

        if not term.islower():  # upper
            term = term_upper
            if term_lower in self.terms:
                term = term_lower
        elif term_upper in self.terms:  # lower
            self.terms.remove(term_upper)
            upper_list = d[term_upper]
            d.pop(term_upper)
            d[term_lower] = upper_list
        self.terms.add(term)

        # Creating indices list
        self.nonstopwords += 1
        tmp_lst = d.get(term, [])
        tmp_lst.append(counter)
        d[term] = tmp_lst
        if self.max_tf < len(tmp_lst):
            self.max_tf = len(tmp_lst)
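
    # Note (added): dictAppender keeps a single spelling per term within the
    # document - a term first seen capitalized is stored as UPPERCASE, but once
    # the same term is seen in lowercase its postings list is folded into the
    # lowercase key. Each dictionary value is the list of token positions, and
    # max_tf tracks the longest such list.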

    def parse_doc(self, doc_as_list):  # Do NOT change signature
        """doc_as_list[3]
        This function takes a tweet document as list and break it into different fields
        :param doc_as_list: list re-preseting the tweet.
        :return: Document object with corresponding fields.
        """
        # Get relevant information from tweet
        tweet_id = doc_as_list[0]
        full_text = doc_as_list[2]
        docText = full_text
        url = doc_as_list[3]
        quote_text = doc_as_list[8]
        if quote_text:
            docText += quote_text

        self.nonstopwords = 0
        self.max_tf = 0

        docText = re.sub(REMOVE_URL_PATTERN, "",
                         docText)  # link (urls) removal from fulltext
        docText = self.num_manipulation(docText)
        docText = self.remove_percent_dollar(docText)

        tokenized_dict, indices_counter, entity_dict = self.parse_sentence(
            docText)
        urlTermList = self.url_parser(url)
        for term in urlTermList:
            indices_counter += 1
            self.dictAppender(tokenized_dict, indices_counter, term)

        doc_length = self.nonstopwords  # after text operations.

        document = Document(tweet_id,
                            term_doc_dictionary=tokenized_dict,
                            doc_length=doc_length,
                            max_tf=self.max_tf,
                            entities_dict=entity_dict)
        return document

    def parse_query(self, query):  # return {term: ([indices,tf])}
        self.nonstopwords = 0
        self.max_tf = 0
        docText = self.num_manipulation(query)
        docText = self.remove_percent_dollar(docText)

        tokenized_dict, indices_counter, entity_dict = self.parse_sentence(
            docText)
        return tokenized_dict, entity_dict

    def remove_stopwords(self, query):
        text_tokens = re.findall(TOKENIZER_PATTERN, query)
        tokens = list(
            filter(lambda x: x.lower() not in self.stop_words, text_tokens))
        query = ' '.join(tokens)
        return query
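
# A minimal, self-contained sketch (added for illustration, not part of the original
# example): the same positional-index idea that dictAppender implements above, without
# the class state. All names below are illustrative assumptions.
def build_positional_index(tokens):
    index, max_tf = {}, 0
    for position, term in enumerate(tokens, start=1):
        index.setdefault(term, []).append(position)  # term -> list of positions
        max_tf = max(max_tf, len(index[term]))       # tf of the most frequent term
    return index, max_tf

# e.g. build_positional_index(['covid', 'cases', 'covid']) -> ({'covid': [1, 3], 'cases': [2]}, 2)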
Example #14
0
class Parse:
    THOUSAND = 1000
    MILLION = 1000000
    BILLION = 1000000000
    TRILLION = 1000000000000
    QUANTITIES = {
        'thousand': 'K',
        'thousands': 'K',
        'million': 'M',
        'millions': 'M',
        'billion': 'B',
        'billions': 'B',
        'trillion': 'TR',
        'trillions': 'TR'
    }
    SIGNS = {'$': '$', 'usd': '$'}
    QUANTITIES_LIST = ['K', 'M', 'B', 'TR', 'TRX', 'TRXX']

    def __init__(self, config):
        self.with_stem = config.get_toStem()
        self.stemmer = Stemmer()
        self.stop_words = stopwords.words('english')
        self.stop_words.extend([
            r' ', r'', r"", r"''", r'""', r'"', r"“", r"”", r"’", r"‘", r"``",
            r"'", r"`", '"'
        ])
        self.stop_words.extend([
            'rt', r'!', r'?', r',', r':', r';', r'(', r')', r'...', r'[', ']',
            r'{', '}', "'&'", '$', '.', r'\'s', '\'s', '\'d', r'\'d', r'n\'t'
        ])
        self.stop_words.extend(['1️⃣.1️⃣2️⃣'])
        self.stop_words_dict = dict.fromkeys(self.stop_words)

        # for avg
        self.total_len_docs = 0
        self.number_of_documents = 0

        self.url_pattern = re.compile(r'http\S+')
        self.url_www_pattern = re.compile(r"[/:?=]")  # split URLs on '/', ':', '?', '='
        # TODO - fix numbers pattern
        self.numbers_pattern = re.compile(r'^\d+([/.,]?\d+)*')
        self.non_latin_pattern = re.compile(
            pattern=
            r'[^\x00-\x7F\x80-\xFF\u0100-\u017F\u0180-\u024F\u1E00-\u1EFF\u2019]'
        )
        self.dates_pattern = re.compile(
            r'^(?:(?:31(\/|-|\.)(?:0?[13578]|1[02]))\1|(?:(?:29|30)(\/|-|\.)(?:0?[13-9]|1[0-2])\2))(?:(?:1[6-9]|[2-9]\d)?\d{2})$|^(?:29(\/|-|\.)0?2\3(?:(?:(?:1[6-9]|[2-9]\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))$|^(?:0?[1-9]|1\d|2[0-8])(\/|-|\.)(?:(?:0?[1-9])|(?:1[0-2]))\4(?:(?:1[6-9]|[2-9]\d)?\d{2})$'
        )
        # TODO - fix emoji to include all emojis
        self.emojis_pattern = re.compile(
            pattern="["
            u"\U0001F600-\U0001F64F"  # emoticons
            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
            u"\U0001F680-\U0001F6FF"  # transport & map symbols
            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
            u"\U00002500-\U00002BEF"  # chinese char
            u"\U00010000-\U0010ffff"
            u"\U0001f926-\U0001f937"
            u"\U000024C2-\U0001F251"
            u"\U00002702-\U000027B0"
            u"\u2640-\u2642"
            u"\u200d"
            u"\u23cf"
            u"\u23e9"
            u"\u231a"
            u"\ufe0f"
            u"\u3030"
            u"\u2600-\u2B55"
            u"\uFE0F\u20E3\uFE0F\u20E3\uFE0F\u20E3"
            "]+",
            flags=re.UNICODE)
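        # Illustrative note (added for clarity, not part of the original code): in
        # parse_sentence below, a token that begins with an emoji has every emoji stripped
        # and only the textual remainder is kept (lowercased); dates_pattern matches
        # dd/mm/yyyy-style dates (also with '-' or '.'); non_latin_pattern is used in
        # parse_doc to drop characters outside the Latin ranges before tokenization.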

    def parse_hashtag(self, all_tokens_list, token):
        if len(token) <= 1:
            return

        t = []
        # --> #stay_at_home
        if '_' in token:
            t.append('#' + re.sub(r'_', '', token))
            t += re.split(r'_', token)
        else:
            # --> #stayAtHome
            if not token.isupper():
                t.append('#' + token)
                t += re.findall('[A-Z][^A-Z]*', token)
            # --> #ASD
            else:
                all_tokens_list.append('#' + token)
                return

        t = [x.lower() for x in t]
        all_tokens_list += t

    def parse_numbers(self, all_tokens_list, token, before_token, after_token,
                      text_tokens):
        def helper(num):
            count = -1
            while num >= 1000:
                num /= 1000
                count += 1
            # keep integers clean, e.g. 140000 -> '140' + 'K' rather than '140.000K'
            if num.is_integer():
                num = int(num)
                return num, count
            return ("%.3f" % num), count

        if '/' in token:
            all_tokens_list.append(token)
            return
        if ',' in token:
            token = token.replace(',', '')

        try:
            token = float(token)
        except ValueError:
            # not a plain number (e.g. a date-like token such as 10.07.2020): keep it as-is
            all_tokens_list.append(token)
            return

        if token.is_integer():
            token = int(token)

        b_tok = None
        is_pers = None

        if before_token and before_token in Parse.SIGNS:
            b_tok = Parse.SIGNS[before_token]

        if after_token:
            after_token = after_token.lower()

            if after_token in Parse.QUANTITIES:

                if token < 1000:
                    if b_tok:
                        all_tokens_list.append(b_tok + str(token) +
                                               Parse.QUANTITIES[after_token])
                        return
                    else:
                        all_tokens_list.append(
                            str(token) + Parse.QUANTITIES[after_token])
                        return
                # a quantity word follows and token >= 1000
                num, count = helper(token)
                i = Parse.QUANTITIES_LIST.index(
                    Parse.QUANTITIES[after_token]) + 1

                count = count + i
                if count > 2:
                    count = count - 2
                    while (count > 0):
                        num = float(num) * 1000
                        count -= 1
                    if num.is_integer():
                        num = int(num)
                    all_tokens_list.append(str(num) + 'B')
                    return
                else:
                    after_token = Parse.QUANTITIES_LIST[count]
                    all_tokens_list.append(str(num) + after_token)
                    return

            if after_token in ('percent', 'percentage', '%'):
                is_pers = True

        if token < 1000:
            final_t = str(token)
        else:
            num, count = helper(token)
            try:
                # more than B
                if count > 2:
                    count = count - 2
                    while (count > 0):
                        num = float(num) * 1000
                        count -= 1
                    if num.is_integer():
                        num = int(num)
                    final_t = str(num) + 'B'
                else:
                    after = Parse.QUANTITIES_LIST[count]
                    final_t = str(num) + after
            except Exception:
                # fallback so final_t is always defined below
                final_t = str(num)
        if b_tok:
            all_tokens_list.append(b_tok + str(final_t))
        elif is_pers:
            all_tokens_list.append(str(final_t) + '%')
        else:
            all_tokens_list.append(str(final_t))
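        # Illustrative note (added for clarity, not part of the original code): examples of
        # the normalization above: '35' followed by 'million' becomes '35M', '10123'
        # becomes '10.123K', '$' before '100' yields '$100', and tokens containing '/'
        # (fractions or dates) are appended unchanged.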

    def parse_sentence(self, text):
        """
        This function tokenizes the text, removes stop words, and lowercases every remaining word.
        :param text:
        :return:
        """
        tokenized_text = []
        text_tokens = word_tokenize(text)
        entity = ''
        entity_counter = 0

        entities_set = set()
        small_big_dict = {}

        for i, token in enumerate(text_tokens):

            if token == ' ':
                continue

            # EMOJIS - extract the token without the emojis
            if re.match(self.emojis_pattern, token):
                token = self.emojis_pattern.sub(r'', token)
                tokenized_text.append(token.lower())

                entity = ''
                entity_counter = 0
                continue

            if token == '@':
                if i < (len(text_tokens) - 1):
                    tokenized_text.append(token + text_tokens[i + 1])
                    text_tokens[i + 1] = ' '  # skip the next token

                    entity = ''
                    entity_counter = 0
                    continue

            if token == '#':
                if i < (len(text_tokens) - 1):
                    self.parse_hashtag(tokenized_text, text_tokens[i + 1])
                    text_tokens[i + 1] = ' '  # skip the next token

                    entity = ''
                    entity_counter = 0
                    continue

            # DATES
            date_match = self.dates_pattern.match(token)
            if date_match:
                tokenized_text.append(token)

            # NUMBERS
            number_match = self.numbers_pattern.match(token)
            if number_match is not None:
                # Numbers over TR
                if len(token) > 18:
                    tokenized_text.append(token)

                    entity = ''
                    entity_counter = 0
                    continue
                start, stop = number_match.span()
                if (stop - start) == len(token):
                    before_t = None
                    after_t = None
                    if i < (len(text_tokens) - 1):
                        after_t = text_tokens[i + 1]
                    if i > 0:
                        before_t = text_tokens[i - 1]
                    self.parse_numbers(tokenized_text, token, before_t,
                                       after_t, text_tokens)

                    entity = ''
                    entity_counter = 0
                    continue

            url_match = self.url_pattern.match(token)
            if url_match:
                if i + 2 < len(text_tokens):
                    if text_tokens[i + 2]:
                        tokenized_text += self.parse_url(text_tokens[i + 2])
                        text_tokens[i + 1] = ' '  # skip the next token
                        text_tokens[i + 2] = ' '  # skip the next token

                        entity = ''
                        entity_counter = 0
                        continue

            # ENTITY AND SMALL_BIG
            if token.isalpha() and token.lower() not in self.stop_words_dict:
                if token[0].isupper():
                    entity += token + ' '
                    entity_counter += 1
                    continue
                else:
                    # entity dict -> decide >= 2 is an entity
                    if entity_counter > 1:
                        # self.entities.append(entity[:-1])
                        entities_set.add(entity[:-1])
                        tokenized_text.append(entity[:-1])
                        entity = ''
                        entity_counter = 0
                        continue
                    # small_big dict: a single capitalized word is recorded (lowercased)
                    # as not yet seen in its lowercase form
                    elif entity_counter == 1:
                        single_word = entity[:-1]  # strip the trailing space
                        if single_word.lower() not in small_big_dict:
                            small_big_dict[single_word.lower()] = False
                        entity = ''
                        entity_counter = 0

                    # the current token is lowercase here
                    if token not in small_big_dict or not small_big_dict[token]:
                        small_big_dict[token.lower()] = True

            if '-' in token:
                tokenized_text.append(token)
                split_tok = [t.lower() for t in token.split('-')]
                tokenized_text += split_tok
                continue

            # append all regular words
            suffix = "…"
            if self.with_stem:
                token = self.stemmer.stem_term(token)
            token = token.lower()
            if token not in self.stop_words_dict and not token.endswith(
                    suffix) and token != suffix and len(token) > 1:
                tokenized_text.append(token)

        return tokenized_text, entities_set, small_big_dict

    def parse_url(self, token):
        split_url = self.url_www_pattern.split(token)
        if 't.co' in split_url or 'twitter.com' in split_url:
            return [split_url[-1].lower()]
        if len(split_url) > 3 and 'www.' in split_url[3]:
            split_url[3] = split_url[3][4:]
        return [t.lower() for t in split_url if (t != 'https' and t != '')]
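    # Illustrative note (added for clarity, not part of the original code): parse_url splits
    # on '/', ':', '?' and '=', then drops the scheme, empty pieces and a leading 'www.', so
    # 'https://www.example.com/page?id=7' yields ['example.com', 'page', 'id', '7'], while
    # t.co / twitter.com shortener links keep only their final path segment.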

    def get_urls(self, all_urls):
        urls = {}
        for url in all_urls:
            if url:
                urls.update(dict(json.loads(url)))
        return urls

    def get_texts(self, all_texts):
        final_text = ""
        for text in all_texts:
            if text:
                final_text += ' ' + text
        return final_text

    def parse_doc(self, doc_as_list):
        """
        This function takes a tweet document as a list and breaks it into its different fields.
        :param doc_as_list: list representing the tweet.
        :return: Document object with corresponding fields.
        """
        tweet_id = doc_as_list[0]
        tweet_date = doc_as_list[1]
        tweet_date_obj = datetime.strptime(tweet_date, '%a %b %d %X %z %Y')
        full_text = doc_as_list[2]
        url = doc_as_list[3]
        # indices = doc_as_list[4]
        retweet_text = doc_as_list[5]
        retweet_url = doc_as_list[6]
        # retweet_indices = doc_as_list[7]
        quote_text = doc_as_list[8]
        quote_url = doc_as_list[9]
        # quote_indice = doc_as_list[10]
        retweet_quoted_text = doc_as_list[11]
        retweet_quoted_urls = doc_as_list[12]
        # retweet_quoted_indices = doc_as_list[13]
        term_dict = {}

        tokenized_text = []
        # parse all urls
        urls = self.get_urls(
            [url, retweet_url, quote_url, retweet_quoted_urls])
        for (key, value) in urls.items():
            if value:
                tokenized_text += self.parse_url(value)
            elif key:
                tokenized_text += self.parse_url(key)

        all_texts = self.get_texts(
            [full_text, quote_text, retweet_quoted_text])
        # remove URLs from the text, but only if the tweet has URL fields
        if len(urls) > 0:
            all_texts = self.url_pattern.sub('', all_texts)

        all_texts = self.non_latin_pattern.sub('', all_texts)

        tokenized_text, entities_set, small_big = self.parse_sentence(
            all_texts)
        unique_terms = set(tokenized_text)

        doc_length = len(tokenized_text)  # after text operations.

        max_tf = 1
        # count the term frequency (tf) of each term in the tweet
        for term in tokenized_text:
            if term not in term_dict:
                term_dict[term] = 1

            else:
                term_dict[term] += 1
                if term_dict[term] > max_tf:
                    max_tf = term_dict[term]

        self.total_len_docs += doc_length
        self.number_of_documents += 1
        # TODO - check if we need to save tokenized_text
        document = Document(tweet_id, max_tf, entities_set, small_big,
                            unique_terms, tweet_date_obj, term_dict,
                            doc_length)

        return document
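
# A minimal, self-contained sketch (added for illustration, not part of the original
# example) of the number normalization that parse_numbers above performs: scale large
# values down by powers of 1000 and attach a K / M / B suffix. Names are illustrative.
def normalize_number(value: float) -> str:
    suffixes = ['K', 'M', 'B']
    if value < 1000:
        return str(int(value)) if float(value).is_integer() else str(value)
    power = -1
    while value >= 1000 and power < len(suffixes) - 1:
        value /= 1000
        power += 1
    scaled = int(value) if value.is_integer() else round(value, 3)
    return f"{scaled}{suffixes[power]}"

# e.g. normalize_number(35_800_000) -> '35.8M'; normalize_number(950) -> '950'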
Example #15
0
class Parse:
    THOUSAND = 1000
    MILLION = 1000000
    BILLION = 1000000000
    TRILLION = 1000000000000
    QUANTITIES = {
        'thousand': 'K',
        'thousands': 'K',
        'million': 'M',
        'millions': 'M',
        'billion': 'B',
        'billions': 'B',
        'trillion': 'TR',
        'trillions': 'TR'
    }
    SIGNS = {'$': '$', 'usd': '$'}
    QUANTITIES_LIST = ['K', 'M', 'B', 'TR', 'TRX', 'TRXX']

    def __init__(self, config):
        self.with_stem = config.get_toStem()
        self.stemmer = Stemmer()
        self.stop_words = stopwords.words('english')
        self.stop_words.extend(['RT'])
        self.stop_words_dict = dict.fromkeys(self.stop_words)

        # for avg
        self.total_len_docs = 0
        self.number_of_documents = 0

        self.url_removal_pattern = re.compile(r'(https?://[^\s]+)')
        # TODO - fix numbers pattern
        self.numbers_pattern = re.compile(r'^\d+([/.,]?\d+)*')
        self.dates_pattern = re.compile(
            r'^(?:(?:31(\/|-|\.)(?:0?[13578]|1[02]))\1|(?:(?:29|30)(\/|-|\.)(?:0?[13-9]|1[0-2])\2))(?:(?:1[6-9]|[2-9]\d)?\d{2})$|^(?:29(\/|-|\.)0?2\3(?:(?:(?:1[6-9]|[2-9]\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))$|^(?:0?[1-9]|1\d|2[0-8])(\/|-|\.)(?:(?:0?[1-9])|(?:1[0-2]))\4(?:(?:1[6-9]|[2-9]\d)?\d{2})$'
        )

    # Changed to emit more informative terms (the original behavior is kept in the comments below)
    def parse_hashtag(self, all_tokens_list, token):
        if len(token) <= 1:
            return
        t = []
        # --> #stay_at_home
        if '_' in token:
            t += re.split(r'_', token)
        else:
            # --> #stayAtHome
            if not token.isupper():
                t += re.findall('[A-Z][^A-Z]*', token)
            # --> #ASD
            else:
                # all_tokens_list.append('#' + token)
                if self.with_stem:
                    token = self.stemmer.stem_term(token)
                if len(token) == 1:
                    return
                all_tokens_list.append(token.lower())
                return

        if self.with_stem:
            t = [self.stemmer.stem_term(x) for x in t]
        else:
            t = [x.lower() for x in t]
        if '' in t:
            t.remove('')
        for term in t:
            if len(term) > 1:
                all_tokens_list.append(term)
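        # Illustrative note (added for clarity, not part of the original code): unlike the
        # earlier example, this version keeps only the hashtag's component words (optionally
        # stemmed, lowercased, longer than one character) and does not emit the '#'-prefixed
        # tag itself; an all-uppercase hashtag is kept as a single lowercased (or stemmed) token.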

    def parse_numbers(self, all_tokens_list, token, before_token, after_token,
                      text_tokens):
        def helper(num):
            count = -1
            while num >= 1000:
                num /= 1000
                count += 1
            # keep integers clean, e.g. 140000 -> '140' + 'K' rather than '140.000K'
            if num.is_integer():
                num = int(num)
                return num, count
            return ("%.3f" % num), count

        if '/' in token:
            all_tokens_list.append(token)
            return
        if ',' in token:
            token = token.replace(',', '')

        try:
            token = float(token)
        except ValueError:
            # not a plain number (e.g. a date-like token such as 10.07.2020): keep it as-is
            all_tokens_list.append(token)
            return

        if token.is_integer():
            token = int(token)

        b_tok = None
        is_pers = None

        if before_token and before_token in Parse.SIGNS:
            b_tok = Parse.SIGNS[before_token]

        if after_token:
            after_token = after_token.lower()

            if after_token in Parse.QUANTITIES:

                if token < 1000:
                    if b_tok:
                        all_tokens_list.append(b_tok + str(token) +
                                               Parse.QUANTITIES[after_token])
                        return
                    else:
                        all_tokens_list.append(
                            str(token) + Parse.QUANTITIES[after_token])
                        return
                # a quantity word follows and token >= 1000
                num, count = helper(token)
                i = Parse.QUANTITIES_LIST.index(
                    Parse.QUANTITIES[after_token]) + 1

                count = count + i
                if count > 2:
                    count = count - 2
                    while (count > 0):
                        num = float(num) * 1000
                        count -= 1
                    if num.is_integer():
                        num = int(num)
                    all_tokens_list.append(str(num) + 'B')
                    return
                else:
                    after_token = Parse.QUANTITIES_LIST[count]
                    all_tokens_list.append(str(num) + after_token)
                    return

            if after_token in ('percent', 'percentage', '%'):
                is_pers = True

        if token < 1000:
            final_t = str(token)
        else:
            num, count = helper(token)
            try:
                # more than B
                if count > 2:
                    count = count - 2
                    while (count > 0):
                        num = float(num) * 1000
                        count -= 1
                    if num.is_integer():
                        num = int(num)
                    final_t = str(num) + 'B'
                else:
                    after = Parse.QUANTITIES_LIST[count]
                    final_t = str(num) + after
            except Exception:
                # fallback so final_t is always defined below
                final_t = str(num)
        if b_tok:
            all_tokens_list.append(b_tok + str(final_t))
        elif is_pers:
            all_tokens_list.append(str(final_t) + '%')
        else:
            all_tokens_list.append(str(final_t))

    def is_cool(self, token):
        if isinstance(token, int):
            return True
        if len(token) == 0:
            return False
        if token in self.stop_words_dict:
            return False
        return all((ord(char) > 32) and (ord(char) < 128) for char in token)
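        # Illustrative note (added for clarity, not part of the original code): is_cool acts
        # as a token filter: ints pass, empty strings and stop words are rejected, and any
        # token containing a character outside printable ASCII (codes 33-127) is rejected,
        # which discards emoji and other non-Latin tokens before further parsing.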

    def parse_sentence(self, text):
        """
        This function tokenizes the text, removes stop words, and lowercases every remaining word.
        :param text:
        :return:
        """

        tokenized_text = []
        text_tokens = word_tokenize(text)

        entities_set = set()
        small_big_dict = {}

        skip = False

        for i, token in enumerate(text_tokens):

            if skip:
                skip = False
                continue

            if self.is_cool(token):

                if token == '#':
                    if i < (len(text_tokens) - 1):
                        self.parse_hashtag(tokenized_text, text_tokens[i + 1])
                        skip = True

                # DATES
                date_match = self.dates_pattern.match(token)
                if date_match:
                    tokenized_text.append(token)

                # NUMBERS
                number_match = self.numbers_pattern.match(token)
                if number_match is not None:
                    # Numbers over TR: keep the raw token and move on
                    if len(token) > 18:
                        tokenized_text.append(token)
                        continue

                    start, stop = number_match.span()
                    if (stop - start) == len(token):
                        before_t = None
                        after_t = None
                        if i < (len(text_tokens) - 1):
                            after_t = text_tokens[i + 1]
                        if i > 0:
                            before_t = text_tokens[i - 1]
                        self.parse_numbers(tokenized_text, token, before_t,
                                           after_t, text_tokens)

                if ('.' in token) and (len(token) > 1) and any(c.isalpha()
                                                               for c in token):
                    tokenized_text.append(token)

                if '-' in token and len(token) > 1:
                    if token == '--':
                        continue
                    if self.with_stem:
                        token = self.stemmer.stem_term(token)
                    tokenized_text.append(token.lower())

                if (token.isalpha() and len(token) > 1
                        and token not in self.stop_words_dict
                        and token.lower() not in self.stop_words_dict):
                    if self.with_stem:
                        token = self.stemmer.stem_term(token)
                    tokenized_text.append(token.lower())

        return tokenized_text, entities_set, small_big_dict

    def url_parse(self, token):
        domain = token.split("//")[-1].split("/")[0].split('?')[0]
        if 'www' in domain and 'com' in domain:
            domain = domain.split('.')
            return domain[1]

    def get_urls(self, all_urls):
        urls = {}
        for url in all_urls:
            if url:
                urls.update(dict(json.loads(url)))
        return urls

    def get_texts(self, all_texts):
        final_text = ""
        for text in all_texts:
            if text:
                final_text += ' ' + text
        return final_text

    def parse_doc(self, doc_as_list):
        """
        This function takes a tweet document as a list and breaks it into its different fields.
        :param doc_as_list: list representing the tweet.
        :return: Document object with corresponding fields.
        """
        tweet_id = doc_as_list[0]
        tweet_date = doc_as_list[1]
        tweet_date_obj = datetime.strptime(tweet_date, '%a %b %d %X %z %Y')
        full_text = doc_as_list[2]
        url = doc_as_list[3]
        retweet_url = doc_as_list[6]
        quote_text = doc_as_list[8]
        quote_url = doc_as_list[9]
        retweet_quoted_text = doc_as_list[11]
        retweet_quoted_urls = doc_as_list[12]
        term_dict = {}

        tokenized_text = []
        # parse all urls
        urls = self.get_urls(
            [url, retweet_url, quote_url, retweet_quoted_urls])
        for (key, value) in urls.items():
            if value:
                domain = self.url_parse(value)
                if domain:
                    tokenized_text.append(domain)  # append the whole domain, not its characters

        all_texts = self.get_texts(
            [full_text, quote_text, retweet_quoted_text])
        # remove urls from the text
        all_texts = self.url_removal_pattern.sub('', all_texts)

        tokenized_text, entities_set, small_big = self.parse_sentence(
            all_texts)
        unique_terms = set(tokenized_text)
        doc_length = len(tokenized_text)  # after text operations.

        max_tf = 1
        # count the term frequency (tf) of each term in the tweet
        for term in tokenized_text:
            if term not in term_dict:
                term_dict[term] = 1

            else:
                term_dict[term] += 1
                if term_dict[term] > max_tf:
                    max_tf = term_dict[term]

        self.total_len_docs += doc_length
        self.number_of_documents += 1
        # TODO - check if we need to save tokenized_text
        document = Document(tweet_id, max_tf, entities_set, small_big,
                            unique_terms, tweet_date_obj, term_dict,
                            doc_length)

        return document
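
# A minimal, self-contained sketch (added for illustration, not part of the original
# example) of the domain-extraction idea used by url_parse above: strip the scheme, path
# and query string, then keep only the second-level name of www.*.com hosts. The helper
# name and the sample URLs are illustrative assumptions.
from typing import Optional

def extract_domain(url: str) -> Optional[str]:
    host = url.split('//')[-1].split('/')[0].split('?')[0]
    if 'www' in host and 'com' in host:
        return host.split('.')[1]
    return None

# e.g. extract_domain('https://www.youtube.com/watch?v=abc') -> 'youtube'
# e.g. extract_domain('https://t.co/xyz') -> None (non-www hosts are ignored here)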