Example #1
def test_usage_single_emoji(nlp, icon):
    emoji = Emoji(nlp)
    nlp.add_pipe(emoji, last=True)
    doc = nlp(u"Hello %s world" % icon)
    assert doc._.has_emoji
    assert doc[1]._.is_emoji
    assert doc[1]._.emoji_desc == emoji.get_emoji_desc(doc[1])
    assert doc[1:3]._.has_emoji
    assert len(doc._.emoji) == 1
    emoji_text, emoji_idx, emoji_desc = doc._.emoji[0]
    assert emoji_text == icon
    assert emoji_idx == 1
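Note: the examples on this page use the spaCy 2.x pipeline API, where an Emoji instance is passed directly to nlp.add_pipe. On spaCy 3.x with spacymoji >= 3.0 the component is added by its registered factory name instead; a minimal equivalent sketch (not part of the original example):

import spacy

nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("emoji", first=True)  # spacymoji registers the "emoji" factory
doc = nlp("Hello 😀 world")
assert doc._.has_emoji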
Example #2
def test_usage_no_emoji(nlp):
    emoji = Emoji(nlp)
    nlp.add_pipe(emoji, last=True)
    doc = nlp(u"In total there are 2,666 emojis in the Unicode Standard.")
    assert not doc._.has_emoji
    for token in doc:
        assert not token._.is_emoji
Example #3
def get_spacy_nlp(core, emojis=True):
    nlp = spacy.load(core)
    if emojis:
        emoji = Emoji(nlp)
        nlp.add_pipe(emoji, first=True)

    return nlp
Example #4
def create_nlp_instance():
    import spacy
    from spacymoji import Emoji

    nlp = spacy.load('en')
    emoji_pipe = Emoji(nlp)
    nlp.add_pipe(emoji_pipe, first=True)

    # Merge hashtag tokens which were split by spacy
    def hashtag_pipe(doc):
        merged_hashtag = False
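        # Re-scan from the beginning after every merge: merging re-tokenizes
        # the Doc, so the indices of the current enumeration become stale.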
        while True:
            for token_index, token in enumerate(doc):
                if token.text == '#':
                    if token.head is not None:
                        start_index = token.idx
                        end_index = start_index + len(token.head.text) + 1
                        if doc.merge(start_index, end_index) is not None:
                            merged_hashtag = True
                            break
            if not merged_hashtag:
                break
            merged_hashtag = False
        return doc

    nlp.add_pipe(hashtag_pipe)
    return nlp
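Note: Doc.merge, used in hashtag_pipe above, is a spaCy 2.x API that was removed in spaCy 3.0. On newer versions the same hashtag merge can be written with the retokenizer context manager; a rough sketch under that assumption, not part of the original example:

def hashtag_pipe_v3(doc):
    # Collect '#' + following-token spans; the merges are applied
    # when the retokenize() context exits.
    with doc.retokenize() as retokenizer:
        for token in doc[:-1]:
            if token.text == '#':
                retokenizer.merge(doc[token.i:token.i + 2])
    return doc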
Example #5
def test_lookup(nlp):
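    # A custom lookup table overrides the default description for specific emoji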
    emoji = Emoji(nlp, lookup={'👨‍🎤': 'David Bowie'})
    nlp.add_pipe(emoji, last=True)
    doc = nlp(u"We can be 👨‍🎤 heroes")
    assert doc._.has_emoji
    assert doc[3]._.is_emoji
    assert doc[3]._.emoji_desc == 'David Bowie'
Example #6
    def build_pipeline(self):
        """Build spaCy pipeline."""
        # Add spacymoji
        emoji = Emoji(self.nlp, merge_spans=False)
        self.nlp.add_pipe(emoji, first=True)
        # Add entity skipping
        self.nlp.add_pipe(self.skip_ents, after='ner')
Example #7
def test_usage_multiple_emoji(nlp):
    emoji = Emoji(nlp)
    nlp.add_pipe(emoji, last=True)
    doc = nlp(u"Hello 😻🍕 world, this ✨ 💥 is an example.")
    assert doc._.has_emoji
    assert len(doc._.emoji) == 4
    assert doc[:5]._.has_emoji
    assert len(doc[:5]._.emoji) == 2
Example #8
    def __init__(self):
        self.nlp = it_core_news_sm.load()
        emoji = Emoji(self.nlp)
        sentencizer = self.nlp.create_pipe("sentencizer")

        # Add components to the pipeline
        self.nlp.add_pipe(emoji, first=True)
        self.nlp.add_pipe(hashtag_pipe, first=True)
        self.nlp.add_pipe(sentencizer)
Example #9
def test_custom_attrs():
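    # attrs renames the default extension attributes (has_emoji, is_emoji, emoji_desc, emoji)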
    attrs = ('contains_emoji', 'equals_emoji', 'emoji_details', 'all_emoji')
    nlp = English()
    emoji = Emoji(nlp, attrs=attrs)
    nlp.add_pipe(emoji, last=True)
    doc = nlp(u"Hello 🎉")
    assert doc._.all_emoji
    assert len(doc._.all_emoji) == 1
    assert doc[1]._.has('equals_emoji')
    assert doc[1]._.emoji_details
Example #10
def get_nlp_v2():
    nlp = spacy.load('en_core_web_sm', disable=['ner'])
    emoji = Emoji(nlp)
    nlp.add_pipe(emoji, first=True)

    # bug with spacy https://github.com/explosion/spaCy/issues/1574
    for word in nlp.Defaults.stop_words.difference(
            ext_spacy.stop_words_modified):
        nlp.vocab[word].is_stop = False

    return nlp
Example #11
def setup_spacy():
    """
    Setup spacy parameters
    Returns
    -------
    spacy.lang.en.English
    """
    nlp = en_core_web_sm.load()
    emoji = Emoji(nlp)
    nlp.add_pipe(emoji, first=True)
    return nlp
Example #12
def test_usage_merge_overlapping(nlp):
    text = '🇺🇸🇦🇷'
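    # Each flag emoji is a pair of regional indicator characters, so the two
    # flags span four code points before the Emoji component merges them.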
    assert len(text) == 4

    emoji = Emoji(nlp)
    nlp.add_pipe(emoji, last=True)
    doc = nlp(text)

    assert len(doc) == 2
    assert doc[0].orth_ == text[0:2]
    assert doc[1].orth_ == text[2:4]
Example #13
def test_usage_merge_spans(nlp, emoji):
    text = u"This is %s a test" % emoji
    emoji = Emoji(nlp)
    doc = nlp(text)
    assert len(doc) > 5
    nlp.add_pipe(emoji, last=True)
    doc = nlp(text)
    assert len(doc) == 5
    assert doc._.has_emoji
    assert doc[2]._.is_emoji
    assert len(doc[2].text) > 1
Example #14
    def load_sapcy(self, lang):
        result = None
        try:
            stemmer_text = Steaming(lang)  # initialise component
            result = spacy.load('es_core_news_md') if lang == 'es' else spacy.load('en_core_web_md')
            emoji = Emoji(result)
            result.add_pipe(emoji, first=True)
            result.add_pipe(stemmer_text, after='parser', name='stemmer')
            print('Language: {0}\nText Analysis: {1}'.format(lang, result.pipe_names))
        except Exception as e:
            Util.standard_error(sys.exc_info())
            print('Error load_sapcy: {0}'.format(e))
        return result
Example #15
def test():
    import spacy
    from spacymoji import Emoji

    nlp = spacy.load('es')
    emoji = Emoji(nlp)
    nlp.add_pipe(emoji, first=True)

    doc = nlp(u"This is a test 😻 👍🏿")
    assert doc._.has_emoji
    assert doc[2:5]._.has_emoji
    assert not doc[0]._.is_emoji
    assert doc[4]._.is_emoji
    assert doc[5]._.emoji_desc == u'thumbs up dark skin tone'
    assert len(doc._.emoji) == 2
    assert doc._.emoji[1] == (u'👍🏿', 5, u'thumbs up dark skin tone')
Example #16
    def _create_spacy_tokenizer(self, language: AnyStr) -> Language:
        """Private method to create a custom spaCy tokenizer for a given language

        Args:
            language: Language code in ISO 639-1 format, cf. https://spacy.io/usage/models#languages

        Returns:
            spaCy Language instance with the tokenizer

        Raises:
            TokenizationError: If something went wrong with the tokenizer creation

        """
        start = perf_counter()
        logging.info(f"Loading tokenizer for language '{language}'...")
        try:
            if language == "th":  # PyThaiNLP requires a "data directory" even if nothing needs to be downloaded
                os.environ["PYTHAINLP_DATA_DIR"] = mkdtemp(
                )  # dummy temp directory
            if language in SPACY_LANGUAGE_MODELS and self.use_models:
                nlp = spacy.load(SPACY_LANGUAGE_MODELS[language])
            else:
                nlp = spacy.blank(
                    language
                )  # spaCy language without models (https://spacy.io/usage/models)
        except (ValueError, OSError) as e:
            raise TokenizationError(
                f"SpaCy tokenization not available for language '{language}' because of error: '{e}'"
            )
        if self.hashtags_as_token:
            re_token_match = spacy.tokenizer._get_regex_pattern(
                nlp.Defaults.token_match)
            # Extend the default token_match pattern so hashtags ("#" + word) are kept as single tokens
            re_token_match = rf"""({re_token_match}|#\w+)"""
            nlp.tokenizer.token_match = re.compile(re_token_match).match
            _prefixes = list(nlp.Defaults.prefixes)
            if "#" in _prefixes:
                _prefixes.remove("#")
                nlp.tokenizer.prefix_search = spacy.util.compile_prefix_regex(
                    _prefixes).search
        if self.stopwords_folder_path and language in SUPPORTED_LANGUAGES_SPACY:
            self._customize_stopwords(nlp, language)
        logging.info(
            f"Loading tokenizer for language '{language}': done in {perf_counter() - start:.2f} seconds"
        )
        if language not in UNSUPPORTED_SPACY_EMOJI_LANG:
            nlp.add_pipe(Emoji(nlp), first=True)
        return nlp
Example #17
    def on_status(self, status):
        blacklist = [
            'netflix', 'rt', 'https', 't', 'co', 'q', 'a', 'o', 'e', 'n', 'pq',
            'vc'
        ]

        nlp = spacy.load("pt_core_news_sm")
        emoji = Emoji(nlp)
        nlp.add_pipe(emoji)

        tokens = nlp(status.text.lower())

        words = [
            token.text for token in tokens
            if not token.is_stop and not token.is_punct
            and not token._.is_emoji and token.text not in blacklist
        ]
        word_list.extend(words)

        fdist = nltk.FreqDist(word_list)
        print('10 MAIS FREQUENTES:')
        print(fdist.most_common(10))
        print('\n')
Example #18
def start():
    nlp = spacy.load('en_core_web_sm')
    nlp.add_pipe(Emoji(nlp), first=True)

    return nlp
Example #19
    def __init__(self):
        self.nlp = spacy.load('en_core_web_sm')
        emoji = Emoji(self.nlp)
        self.nlp.add_pipe(emoji, first=True)
Example #20
def test_pattern_id(nlp, pattern_id):
    emoji = Emoji(nlp, pattern_id=pattern_id)
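    # pattern_id replaces the default "EMOJI" rule name in the component's internal Matcher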
    assert pattern_id in emoji.matcher
    assert "EMOJI" not in emoji.matcher
Example #21
    size = len(data)

    anger = [0] * size
    anticipation = [0] * size
    disgust = [0] * size
    fear = [0] * size
    joy = [0] * size
    sadness = [0] * size
    surprise = [0] * size
    trust = [0] * size

    index = 0

    nlp = spacy.load("it_core_news_sm")
    emoji = Emoji(nlp, merge_spans=False)
    nlp.add_pipe(emoji, first=True)

    length = len(data)

    for _, row in data.iterrows():

        if index % 100000 == 0:
            print(" {}% of tweets were analysed in {:.2f} seconds".format(
                100 * index / length,
                time.time() - start_time))

        for token in nlp(row['text']):

            if token._.is_emoji:
Example #22
def main(model='C:/Users/Pasante/Desktop/Gastón/drug_model_v4/spacy_model',
         new_model_name='spacy_model_es_drug',
         output_dir='C:/Users/Pasante/Desktop/Gastón/spacy_model_es_drug',
         n_iter=200):
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Modelo cargado '%s'" % model)
    else:
        nlp = spacy.blank('es')  # create blank Language class
        print("Modelo en blanco 'es' creado")

    # Add emojis to pipe
    emoji = Emoji(nlp)
    nlp.add_pipe(emoji, first=True)
    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    # otherwise, get it, so we can add labels to it
    else:
        ner = nlp.get_pipe('ner')

    ner.add_label(LABEL1)  # add new entity label to entity recognizer
    ner.add_label(LABEL2)  # add new entity label to entity recognizer
    losses_max = 99999999999
    t0 = datetime.datetime.now()
    print("Start: ", t0)

    # load test text
    validation_data = list(
        filter(None,
               open('test_text.txt', 'r').read().splitlines()))
    iteration = 0
    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.begin_training()

        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            iteration = iteration + 1
            losses = {}
            for raw_text, entity_offsets in TRAIN_DATA:
                nlp.update([raw_text], [entity_offsets],
                           drop=0.25,
                           sgd=optimizer,
                           losses=losses)
            print(losses)
            print("Iteration:" + str(iteration))
            for text in validation_data:
                doc = nlp(text)
                ents = [(ent.text, ent.label_) for ent in doc.ents]
                for ent, label in ents:
                    print(f'Found entity: "{ent}": "{label}",')

            # save model to output directory
            # take the most successful
            for a in losses.keys():
                if losses[a] < losses_max:
                    losses_max = losses[a]
                    if output_dir is not None:
                        output_dir = Path(output_dir)
                        if not output_dir.exists():
                            output_dir.mkdir()
                        nlp.meta['name'] = new_model_name  # rename model
                        nlp.to_disk(output_dir)
                        print("saved model: ", output_dir)

    # test the trained model
    print("Finished: ", datetime.datetime.now() - t0)
    print("\n\n\n")
Example #23
def simple_identification():
    client_from = MongoClient()
    db_from = client_from["SSD"]
    coll_from = db_from["raw_data"]
    start_time = time()
    date = datetime.today().strftime("%Y_%m_%d-%H_%M_%S")
    path_to_file = date + " - DetectEmoticonsAndEmojis_Performance.txt"
    p_file = codecs.open(path_to_file, encoding='utf-8', mode='a')
    p_file.write(date +
                 " Detecting Emoticons & Emojis Test - Local Execution" + "\n")
    p_file.flush()
    # II. Prepare data
    p_file.write("Preparing initial data ... " + "\n")
    path_to_configuration = food_detection_root.ROOT_DIR + os.path.sep + 'configuration' + os.path.sep \
                            + 'configuration.ini'
    config = ConfigParser(interpolation=ExtendedInterpolation())
    config.read_file(codecs.open(path_to_configuration, "r", "utf8"))
    # Read emojis
    path = food_detection_root.ROOT_DIR + os.path.sep + 'data' + os.path.sep
    unicode_emoji_list_file = codecs.open(path + "list - unicode_emojis.txt",
                                          encoding='utf-8')
    emoji_list = unicode_emoji_list_file.read().splitlines()
    unicode_emoji_list_file.close()
    aux_emojis_dict = {}
    emojis_dict = {}
    for aux in emoji_list:
        aux_emoji = aux.split('\t')
        aux_emojis_dict[aux_emoji[1]] = [aux_emoji[2], aux_emoji[3]]
        emojis_dict[aux_emoji[2]] = [aux_emoji[1], aux_emoji[3]]
    sorted_aux_emojis_list = sorted(aux_emojis_dict.keys(),
                                    key=len,
                                    reverse=True)
    emojis_list = list()
    for aux_emoji in sorted_aux_emojis_list:
        emojis_list.append(aux_emojis_dict[aux_emoji][0])
    # print(emojis_list)
    # Read complementary characters
    complementary_characters_list_file = codecs.open(
        path + "list - complementary_characters.txt", encoding='utf-8')
    complementary_characters_list = complementary_characters_list_file.read(
    ).splitlines()
    complementary_characters_list_file.close()
    complementary_characters_dict = {}
    for aux in complementary_characters_list:
        aux_char = aux.split('\t')
        complementary_characters_dict[aux_char[2]] = [aux_char[1], aux_char[3]]
    # print(complementary_characters_dict)
    # Read emoticons
    emoticon_list_file = codecs.open(path + "list - emoticons.txt",
                                     encoding='utf-8')
    emoticon_list = emoticon_list_file.read().splitlines()
    emoticon_list_file.close()
    emoticons_dict = {}
    for aux in emoticon_list:
        aux_emoticon = aux.split('\t')
        # print(aux_emoticon)
        emoticons_dict[aux_emoticon[0]] = aux_emoticon[1]
    # print(emoticons_dict)
    # 1. Configure Google_Universal_POS_Tags
    tags = config.options("Google_Universal_POS_Tags")
    google_universal_tags = {}
    for tag in tags:
        google_universal_tags[tag.upper()] = config.get(
            'Google_Universal_POS_Tags', tag)
    # 2. Read special characters (#, @, https, etc.)
    special_characters = ast.literal_eval(
        config.get('TextAnalysis', 'special_characters'))
    additional_symbols = ast.literal_eval(
        config.get('TextAnalysis', 'additional_symbols'))
    variation_selectors = ast.literal_eval(
        config.get('TextAnalysis', 'variation_selectors'))
    # 3. Configure Spanish POS tagger
    spanish_pipeline = spacy.load('es')
    emoji = Emoji(spanish_pipeline)
    spanish_pipeline.add_pipe(emoji, first=True)
    tag_map = spacy.es.TAG_MAP
    all_from_tweets = coll_from.find()
    count = 0
    stop = 100000
    p_file.write("Total data to process: " + str(stop) + "\n")
    emoticons = []
    emojis = []
    complementary_characters = []
    texts = []
    emojis_count = 0
    emoticon_count = 0
    complementary_characters_count = 0
    for raw_data in all_from_tweets:
        if 'text' in raw_data.keys() and 'lang' in raw_data.keys():
            if "place" in raw_data.keys():
                place = raw_data["place"]
                if place is not None:
                    if "country_code" in place.keys():
                        raw_data_country_code = raw_data["place"][
                            "country_code"]
                        if raw_data_country_code in ["CO"]:
                            lang = raw_data["lang"]
                            text = raw_data['text']
                            if lang == 'es':
                                results = identify_special_characters(
                                    text, spanish_pipeline, tag_map,
                                    emoticons_dict, emojis_dict, emojis_list,
                                    variation_selectors,
                                    complementary_characters_dict,
                                    emoticon_count, emojis_count,
                                    complementary_characters_count)

                                spaced_text = results[0]
                                final_clean_text = results[1]
                                emoticons += copy.deepcopy(results[2])
                                emojis += copy.deepcopy(results[3])
                                complementary_characters += copy.deepcopy(
                                    results[4])
                                emoticon_count = results[5]
                                emojis_count = results[6]
                                complementary_characters_count = results[7]
                                if len(results[2]) != 0 or len(
                                        results[3]) != 0 or len(
                                            results[4]) != 0:
                                    texts.append(spaced_text + '\t' +
                                                 final_clean_text)
                                count += 1
                            else:
                                if len(text) >= 3:
                                    blob = TextBlob(text)
                                    detection = True
                                    detected_language = ''
                                    while detection:
                                        try:
                                            detected_language = blob.detect_language(
                                            )
                                            detection = False
                                        except:
                                            print(
                                                'error while getting detected language'
                                            )
                                    if detected_language == 'es':
                                        results = identify_special_characters(
                                            text, spanish_pipeline, tag_map,
                                            emoticons_dict, emojis_dict,
                                            emojis_list, variation_selectors,
                                            complementary_characters_dict,
                                            emoticon_count, emojis_count,
                                            complementary_characters_count)

                                        spaced_text = results[0]
                                        final_clean_text = results[1]
                                        emoticons += copy.deepcopy(results[2])
                                        emojis += copy.deepcopy(results[3])
                                        complementary_characters += copy.deepcopy(
                                            results[4])
                                        emoticon_count = results[5]
                                        emojis_count = results[6]
                                        complementary_characters_count = results[
                                            7]
                                        if len(results[2]) != 0 or len(
                                                results[3]) != 0 or len(
                                                    results[4]) != 0:
                                            texts.append(spaced_text + '\t' +
                                                         final_clean_text)
                                        count += 1
                            print(count)
                            if count == stop:
                                break
    all_from_tweets.close()
    client_from.close()
    p_file.write("Emoticons " + str(len(emoticons)) + "\n")
    emoticons_counter = Counter(emoticons).most_common()
    emoticons_counter_sorted = sorted(emoticons_counter,
                                      key=lambda tup: tup[1])
    for emoticon in emoticons_counter_sorted:
        p_file.write(str(emoticon[0]) + "\t" + str(emoticon[1]) + "\n")
    p_file.write("Total Emoticons: " + str(emoticon_count) + ". Proportion: " +
                 str(emoticon_count / stop) + "\n")
    p_file.write("Emojis " + str(len(emojis)) + "\n")
    emojis_counter = Counter(emojis).most_common()
    emojis_counter_sorted = sorted(emojis_counter, key=lambda tup: tup[1])
    for emoji in emojis_counter_sorted:
        p_file.write(str(emoji[0]) + "\t" + str(emoji[1]) + "\n")
    p_file.write("Total Emojis: " + str(emojis_count) + ". Proportion: " +
                 str(emojis_count / stop) + "\n")
    p_file.write("Complementary Characters " +
                 str(len(complementary_characters)) + "\n")
    cc_counter = Counter(complementary_characters).most_common()
    cc_counter_sorted = sorted(cc_counter, key=lambda tup: tup[1])
    for cc in cc_counter_sorted:
        p_file.write(str(cc[0]) + "\t" + str(cc[1]) + "\n")
    p_file.write("Total Complementary Characters: " +
                 str(complementary_characters_count) + ". Proportion: " +
                 str(complementary_characters_count / stop) + "\n")
    p_file.write("Texts without: " + "\n")
    for text in texts:
        p_file.write(text + "\n")
    p_file.write("Total elements in new list: " + str(count) + "\n")
    execution_time = time() - start_time
    p_file.write("Execution time: " + str(timedelta(seconds=execution_time)) +
                 "\n")
    p_file.flush()
    p_file.close()
Example #24
                file.write(".")


# Creates a copy of the list of entities found, with duplicates removed
def remove_duplicates(lst):
    newlist = []
    for element in lst:
        if element not in newlist:
            newlist.append(element)
    return newlist


# Load the spaCy model pre-trained with the "DRUG" entity
nlp = spacy.load('spacy_model_es_drug')
# Add emoji recognition to the pipeline to analyse their possible relation to drugs
emoji = Emoji(nlp)
nlp.add_pipe(emoji, first=True)
path = 'C:/Users/Gaston Migone/Desktop/Gastón/Drug_Train/Files/*.csv'
files = glob.glob(path)
for name in files:
    try:
        print(f"\nAnalizando archivo: {name}... ")
        doc = open(name, 'r', encoding="utf8").read()
        data = [(x.strip(), len(x) + 1) for x in doc.splitlines() if x]

        drugs_list = []
        per_list = []
        loc_list = []
        org_list = []
        misc_list = []
        me_list = []
Example #25
def test_integration(nlp):
    emoji = Emoji(nlp)
    nlp.add_pipe(emoji, last=True)
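    # The component registers under the name 'emoji', so it is the last entry in pipe_names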
    assert nlp.pipe_names[-1] == 'emoji'
Example #26
import spacy
from spacytextblob.spacytextblob import SpacyTextBlob
import lemminflect
from collections import Counter, defaultdict
from spacy_langdetect import LanguageDetector
from spacymoji import Emoji
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
# from afinn import Afinn
import re
import json

nlp = spacy.load('en_core_web_md')
nlp.add_pipe(Emoji(nlp, merge_spans=False), first=True)
nlp.add_pipe(SpacyTextBlob())
nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)

# Comment out for quick summary
from summarizer import Summarizer

model = Summarizer()

# afinn = Afinn()


def get_summary(raw_text, category_list=[]):

    # file = 'raw_text.txt'
    # with open(file) as f:
Example #27
def simple_identification():
    client_from = MongoClient()
    db_from = client_from["SSD"]
    coll_from = db_from["raw_data"]
    start_time = time()
    date = datetime.today().strftime("%Y_%m_%d-%H_%M_%S")
    path_to_file = date + " - DetectRegexEmoticonsRawData_Performance.txt"
    p_file = codecs.open(path_to_file, encoding='utf-8', mode='a')
    p_file.write(date + " Detecting Emoticons with Regex Expression Test - Local Execution" + "\n")
    p_file.flush()
    # II. Prepare data
    p_file.write("Preparing initial data ... " + "\n")
    path_to_configuration = food_detection_root.ROOT_DIR + os.path.sep + 'configuration' + os.path.sep \
                            + 'configuration.ini'
    config = ConfigParser(interpolation=ExtendedInterpolation())
    config.read_file(codecs.open(path_to_configuration, "r", "utf8"))
    # print(emoticons_dict)
    # 3. Configure Spanish POS tagger
    spanish_pipeline = spacy.load('es')
    emoji = Emoji(spanish_pipeline)
    spanish_pipeline.add_pipe(emoji, first=True)
    all_from_tweets = coll_from.find()
    count = 0
    stop = 1000
    p_file.write("Total data to process: " + str(stop) + "\n")
    emoticons = []
    emoticon_pattern = r"([:;=]-?([\)D\(\C\co]+|[xX][dD]+))|([:<][3])|(<\\3)|()"
    emoticons_characters = ['X', 'x', 'd', 'D', ')', '(', ':', ';', '\'', '*', '=', '/', '$', '#', '-', 'C', 'c', '<', '3', '0', 'O', 'o']
    texts = []
    no_texts = []
    emoticon_count = 0
    for raw_data in all_from_tweets:
        if 'text' in raw_data.keys() and 'lang' in raw_data.keys():
            if "place" in raw_data.keys():
                place = raw_data["place"]
                if place is not None:
                    if "country_code" in place.keys():
                        raw_data_country_code = raw_data["place"]["country_code"]
                        if raw_data_country_code in ["CO"]:
                            lang = raw_data["lang"]
                            text = raw_data['text']
                            raw_entities = raw_data['entities']
                            if lang == 'es':
                                results = identify_emoticons(text, raw_entities, spanish_pipeline, emoticon_count,
                                                             emoticon_pattern, emoticons_characters)
                                text = results[0]
                                clean_text = results[1]
                                emoticon_count = results[2]
                                special_entities = results[3]
                                emoticons += copy.deepcopy(results[4])
                                if len(results[4]) != 0:
                                    texts.append(text + '\t' + clean_text + '\t' + str(special_entities))
                                else:
                                    no_texts.append(text + '\t' + clean_text)
                                count += 1
                            else:
                                if len(text) >= 3:
                                    blob = TextBlob(text)
                                    detection = True
                                    detected_language = ''
                                    while detection:
                                        try:
                                            detected_language = blob.detect_language()
                                            detection = False
                                        except:
                                            print('error while getting detected language')
                                    if detected_language == 'es':
                                        results = identify_emoticons(text, raw_entities, spanish_pipeline,
                                                                     emoticon_count,
                                                                     emoticon_pattern, emoticons_characters)
                                        text = results[0]
                                        clean_text = results[1]
                                        emoticon_count = results[2]
                                        special_entities = results[3]
                                        emoticons += copy.deepcopy(results[4])
                                        if len(results[4]) != 0:
                                            texts.append(text + '\t' + clean_text + '\t' + str(special_entities))
                                        else:
                                            no_texts.append(text + '\t' + clean_text)
                                        count += 1
                            print(count)
                            if count == stop:
                                break
    all_from_tweets.close()
    client_from.close()
    p_file.write("Emoticons " + str(len(emoticons)) + "\n")
    emoticons_counter = Counter(emoticons).most_common()
    emoticons_counter_sorted = sorted(emoticons_counter, key=lambda tup: tup[1])
    for emoticon in emoticons_counter_sorted:
        p_file.write(str(emoticon[0]) + "\t" + str(emoticon[1]) + "\n")
    p_file.write("Total Emoticons: " + str(emoticon_count) + ". Proportion: " + str(emoticon_count / stop) + "\n")
    p_file.write("TEXTS WITH EMOTICONS: \n")
    for text in texts:
        p_file.write(text + "\n")
    p_file.write("TEXTS WITHOUT EMOTICONS: \n")
    for text in no_texts:
        p_file.write(text + "\n")
    p_file.write("Total elements in new list: " + str(count) + "\n")
    execution_time = time() - start_time
    p_file.write("Execution time: " + str(timedelta(seconds=execution_time)) + "\n")
    p_file.flush()
    p_file.close()
Example #28
def simple_identification():
    client_from = MongoClient()
    db_from = client_from["SSD"]
    coll_from = db_from["raw_data"]
    start_time = time()
    date = datetime.today().strftime("%Y_%m_%d-%H_%M_%S")
    path_to_file = date + " - DetectEmojisWithSpacymoji_Performance.txt"
    p_file = codecs.open(path_to_file, encoding='utf-8', mode='a')
    p_file.write(date +
                 " Detecting Emojis with Spacymoji Test - Local Execution" +
                 "\n")
    p_file.flush()
    # II. Prepare data
    p_file.write("Preparing initial data ... " + "\n")
    path_to_configuration = food_detection_root.ROOT_DIR + os.path.sep + 'configuration' + os.path.sep \
                            + 'configuration.ini'
    config = ConfigParser(interpolation=ExtendedInterpolation())
    config.read_file(codecs.open(path_to_configuration, "r", "utf8"))
    path = food_detection_root.ROOT_DIR + os.path.sep + 'data' + os.path.sep
    # Read complementary characters
    complementary_characters_list_file = codecs.open(
        path + "list - complementary_characters.txt", encoding='utf-8')
    complementary_characters_list = complementary_characters_list_file.read(
    ).splitlines()
    complementary_characters_list_file.close()
    complementary_characters_dict = {}
    for aux in complementary_characters_list:
        aux_char = aux.split('\t')
        complementary_characters_dict[aux_char[2]] = [aux_char[1], aux_char[3]]
    # print(complementary_characters_dict)
    # 3. Configure Spanish POS tagger
    spanish_pipeline = spacy.load('es')
    emoji = Emoji(spanish_pipeline)
    spanish_pipeline.add_pipe(emoji, first=True)
    tag_map = spacy.es.TAG_MAP
    # start
    all_from_tweets = coll_from.find()
    count = 0
    stop = 1000
    p_file.write("Total data to process: " + str(stop) + "\n")
    for raw_data in all_from_tweets:
        if 'text' in raw_data.keys() and 'lang' in raw_data.keys():
            if "place" in raw_data.keys():
                place = raw_data["place"]
                if place is not None:
                    if "country_code" in place.keys():
                        raw_data_country_code = raw_data["place"][
                            "country_code"]
                        if raw_data_country_code in ["CO"]:
                            lang = raw_data["lang"]
                            text = raw_data['text']
                            if lang == 'es':
                                identify_special_characters(
                                    text, spanish_pipeline, tag_map, p_file)
                                count += 1
                            else:
                                if len(text) >= 3:
                                    blob = TextBlob(text)
                                    detection = True
                                    detected_language = ''
                                    while detection:
                                        try:
                                            detected_language = blob.detect_language(
                                            )
                                            detection = False
                                        except:
                                            print(
                                                'error while getting detected language'
                                            )
                                    if detected_language == 'es':
                                        identify_special_characters(
                                            text, spanish_pipeline, tag_map,
                                            p_file)
                                        count += 1
                            print(count)
                            if count == stop:
                                break
    all_from_tweets.close()
    client_from.close()
    p_file.write("Total elements in new list: " + str(count) + "\n")
    execution_time = time() - start_time
    p_file.write("Execution time: " + str(timedelta(seconds=execution_time)) +
                 "\n")
    p_file.flush()
    p_file.close()
Example #29
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import pandas as pd

analyzer = SentimentIntensityAnalyzer()
import spacy
from spacymoji import Emoji

nlp = spacy.load('en')
emoji = Emoji(nlp)
nlp.add_pipe(emoji, first=True)

nlp1 = spacy.load('en')


def add_compound(set):
    emotion_set = []
    for index, row in set.iterrows():
        vs = analyzer.polarity_scores(row['twitters'])
        emotion_set.append(vs['compound'])
    set['emotion_score'] = emotion_set
    return set


def add_emojis(frame):

    emojisite = []
    for index, row in frame.iterrows():
        num_of_emoji = 0
        post = row['twitters']
        try:
            tokens = nlp(post)