def text_contains_emoji(text):
    """Return True if *text* contains at least one emoji.

    Regional-indicator flags are two code points that only demojize as a
    pair, so a two-character string is additionally probed with a space
    inserted between the halves.
    https://en.wikipedia.org/wiki/Regional_Indicator_Symbol
    """
    if any(emoji.demojize(ch) != ch for ch in text):
        return True
    # Edge case: flags are represented as two regional-indicator chars.
    if len(text) == 2:
        spaced_string = "{} {}".format(text[0], text[1])
        return emoji.demojize(spaced_string) != spaced_string
    return False
def on_status(self, tweet):
    """Tweepy stream callback: set the mentioning user's Yo status.

    Expects a tweet of the form ".YoApp <emoji>".  Replies with usage
    help for non-emoji input, asks the user to link their account when
    no Yo access token is cached in redis, and otherwise posts the
    emoji to the Yo status API and confirms.
    """
    try:
        twitter_user_id = str(tweet.user.id)
        splitted = tweet.text.split(' ')
        # BUG FIX: also require a second token — a bare mention used to
        # raise IndexError on splitted[1] (silently eaten by the except).
        if 2 <= len(splitted) <= 3:
            emoji_status = splitted[1]
            # demojize changes the string iff it contains an emoji.
            is_valid_emoji = emoji.demojize(emoji_status) != emoji_status
            if not is_valid_emoji:
                app_api.update_status(status=u'@' + tweet.user.screen_name + u' try again with a single emoji: ".YoApp 😂"')
                return
            yo_access_token = redis_store.get('yo.token.for.twitter.user.id:' + twitter_user_id)
            if not yo_access_token:
                app_api.update_status(status=u'@' + tweet.user.screen_name + u' let\'s link your twitter to your Yo Status here: https://yostat.us/twitter/authorize')
                return
            response = requests.post('https://api.justyo.co/status/',
                                     json={'status': emoji_status,
                                           'access_token': yo_access_token})
            if response.status_code == 200:
                # BUG FIX: previously concatenated the `emoji` MODULE
                # object (TypeError); use the status string instead.
                app_api.update_status(status=u'@' + tweet.user.screen_name + u' your status is now: ' + emoji_status)
            else:
                app_api.update_status(status=u'@' + tweet.user.screen_name + u' let\'s link your twitter to your Yo Status here: https://yostat.us/twitter/authorize')
    except Exception as e:
        # BUG FIX: was the py2 statement `print e.message`; e.message is
        # also deprecated — print the exception itself.
        print(e)
def _handle_message(self, msg):
    """Parse a single message row and emit it as an org-mode sub-item.

    Mutates *msg* in place: normalises the phone number, resolves the
    contact name, derives direction fields, optionally rewrites emoji,
    then writes the formatted entry unless the text is empty or the
    message is ignored.
    """
    # '1234@s.whatsapp.net' -> '001234': drop the domain, add the
    # international-dialling prefix.
    msg['number'] = '00' + msg['number'].split('@')[0]
    # Fall back to the raw number when no contact name is known.
    msg['name'] = self._numberdict.get(msg['number'], msg['number'])
    msg['verb'] = 'to' if msg['type'] else 'from'
    msg['type'] = 'OUTGOING' if msg['type'] else 'INCOMING'
    msg['handler'] = self._args.handler
    if msg['text']:
        if self._args.demojize:
            # Replace emoji with their ':name:' aliases.
            msg['text'] = emoji.demojize(msg['text'])
        if self._args.skip_emoji:
            # Drop emoji characters entirely.
            msg['text'] = re.sub(emoji.get_emoji_regexp(), '', msg['text'])
    # Timestamps are stored in milliseconds since the epoch.
    timestamp = datetime.datetime.fromtimestamp(msg['timestamp'] / 1000)
    # Hash the whole row so re-runs produce stable org IDs.
    properties = OrgProperties(data_for_hashing=json.dumps(msg))
    properties.add('NUMBER', msg['number'])
    properties.add('TYPE', msg['type'])
    output = self._args.output_format.format(**msg)
    if msg['text'] and not self._is_ignored(msg):
        self._writer.write_org_subitem(timestamp=OrgFormat.datetime(timestamp),
                                       output=output,
                                       properties=properties)
def get_emoji_counts(master, emoji_counts, candidate):
    """Accumulate per-day emoji token counts for one candidate.

    *master* maps tweet ids to (tweet_text, created_at, ...) records;
    counts are stored as emoji_counts[candidate][MM-DD-YYYY][':name:'].
    Returns *emoji_counts* (which is also mutated in place).
    """
    # Idiom fix: setdefault replaces the `not in dict.keys()` check.
    per_day = emoji_counts.setdefault(candidate, {})
    for record in master.values():
        tweet = record[0]
        date = record[1]
        date = datetime.datetime.strptime(date, '%a %b %d %H:%M:%S %Z %Y')
        date_ft = date.strftime('%m-%d-%Y')
        # Replace all URLs in the tweet first so they are not mistaken
        # for emoticons by the tokenizer.
        tweet = re.sub('htt[^ ]*', 'URL', tweet)
        tokens = [emoji.demojize(token) for token in twtokenizer.tokenize(tweet)]
        for token in tokens:
            # demojize renders emoji as ':name:'; count those tokens.
            if re.match(':+[a-z_]*:*', token):
                day_counts = per_day.setdefault(date_ft, {})
                day_counts[token] = day_counts.get(token, 0) + 1
    return emoji_counts
def clean(instring, spaces=True):
    """Normalise a string: strip punctuation and separate emoji aliases.

    Punctuation characters become spaces, emoji are demojized into
    ':alias:' tokens (adjacent aliases are split apart), whitespace runs
    collapse to a single space (or are removed entirely when
    spaces=False), and the result is lower-cased.
    """
    # BUG FIX: str.replace returns a new string — the old code discarded
    # the result, so newlines were never removed.
    instring = instring.replace("\n", " ")
    for x in punctuation:
        instring = instring.replace(x, " ")
    # demojize turns emojis into text with this format: :emoji_text_alias:
    instring = emoji.demojize(instring)
    if instring.find(":") > -1:  # then the input has emojis!
        # BUG FIX: the old list.index(":")-based loop could never find a
        # bare ':' word (so it did nothing), or looped forever when it
        # did; adjacent aliases simply need '::' split into ': :'.
        while instring.find("::") > -1:
            instring = instring.replace("::", ": :")
        # BUG FIX: the old re-join concatenated words with NO separator,
        # destroying every word boundary.
        instring = " ".join(instring.split())
    if spaces:
        while instring.find("  ") > -1:  # remove double spaces
            instring = instring.replace("  ", " ")
    else:
        while instring.find(" ") > -1:  # remove all spaces
            instring = instring.replace(" ", "")
    instring = instring.lower()
    return instring
def comment_image(browser, comments):
    """Post a random comment from *comments* on the currently open image.

    Returns 1 after a 2-second pause regardless of success.
    """
    # Round-trip through demojize/emojize so alias shortcuts in the
    # comment pool render as real emoji.
    rand_comment = (choice(comments))
    rand_comment = emoji.demojize(rand_comment)
    rand_comment = emoji.emojize(rand_comment, use_aliases=True)
    comment_input = browser.find_elements_by_xpath('//textarea[@placeholder = "Add a comment…"]')
    if len(comment_input) <= 0:
        # Older page layouts use an <input> instead of a <textarea>.
        comment_input = browser.find_elements_by_xpath('//input[@placeholder = "Add a comment…"]')
    if len(comment_input) > 0:
        # NOTE(review): building the JS by string concatenation breaks if
        # the comment contains a single quote — consider passing the text
        # as an execute_script argument instead.
        browser.execute_script("arguments[0].value = '" + rand_comment + " ';", comment_input[0]);
        # An extra space is added here and then deleted. This forces the
        # input box to update the reactJS core
        comment_input[0].send_keys("\b")
        comment_input[0].submit()
    else:
        print(u'--> Warning: Comment Action Likely Failed: Comment Element not found')
    # print(u'--> Commented: {}'.format(rand_comment))
    # print("--> Commented: " + rand_comment.encode('utf-8'))
    # NOTE(review): this logs "Commented" even when the element was not
    # found above.
    print("--> Commented: {}".format(rand_comment.encode('utf-8')))
    sleep(2)
    return 1
def decrypt(self, encrypted_message):
    """Decrypt an emoji Caesar-cipher message back to plain text.

    The cipher dict (built lazily by define_cipher) maps plain
    characters to ':alias:' emoji names; it is inverted here.  Returns
    the decrypted string.
    """
    # Simple Ceasar Cypher, the emoji-key index position marks 'a', the
    # rest of the alphabet is defined from starting index 'a'.
    # cipher dict is regenerated as in Encrypt, then key/value pairs are
    # reversed.
    if self.cipher == None:
        self.cipher = self.define_cipher()
    # reverse the cipher: alias -> plain character
    rev_cipher = {v: k for k, v in self.cipher.items()}
    decrypted = []
    encrypted_message = (emoji.demojize(encrypted_message))
    # Combination emoji (like sign_of_the_horns_light_skin_tone) contain
    # spaces after demojizing, so real spaces are first escaped as '~'
    # and the ':' delimiters become the split points.
    line = re.sub(' ', '~', encrypted_message)
    line = re.sub(':', ' ', line)
    line_list = (line.split())
    for symbol in line_list:
        # Re-wrap the token in ':' so it matches the cipher's alias form.
        mod_symbol = ':'+symbol+':'
        if mod_symbol in rev_cipher:
            decrypted.append(rev_cipher[mod_symbol])
        else:
            # Not a cipher alias: restore spaces and strip delimiters.
            mod_symbol = re.sub('~', ' ', mod_symbol)
            mod_symbol = re.sub(':', '', mod_symbol)
            decrypted.append(mod_symbol)
    return ''.join(decrypted)
def test_misc():
    """Ad-hoc unicode/emoji playground (not a real unit test).

    Exercises escape sequences, emojize/demojize round-trips, and
    skin-tone modifier composition.
    """
    trans()
    print(u'\U0001f604'.encode('unicode-escape'))
    print(u'\U0001f604')
    ss = u'\U0001f604'
    # NOTE(review): on Python 3, ss[0] is a one-char str and chr() wants
    # an int — this raises TypeError; ord(ss[0]) was probably intended.
    xx = chr(ss[0])
    print("ss({}) xx({})".format(ss, xx))
    # -*- coding: UTF-8 -*-
    # convert to unicode
    teststring = "I am happy \U0001f604"
    # teststring = unicode(teststring, 'utf-8')
    # encode it with string escape
    teststring = teststring.encode('unicode_escape')
    print("💗 Growing Heart")
    print(emoji.emojize('Water! :water_wave:'))
    print(emoji.demojize(u'🌊'))  # for Python 2.x
    # print(emoji.demojize('🌊')) # for Python 3.x.
    print(u"And \U0001F60D")
    # NOTE(review): 0x1f680 is the ROCKET code point, not an astronaut —
    # the labels on the next two lines do not match what is printed.
    print("(-woman) astronaut", chr(int("0001f680", 16)))
    print("woman_astronaut", chr(int("0x0001f680", 0)))
    # Dancer + medium-dark skin-tone modifier, composed several ways.
    print("\U0001f483\U0001f3fe")
    print(chr(0x001f483), chr(0x001f3fe))
    print('💃 🏾 ')
    print(chr(0x001f483)+chr(0x001f3fe))
    print('💃🏾 ')
    print(chr(int('1f483', 16))+chr(int('1f3fe', 16)))
    print('%8s %8s %8s' % qw_tuple('surf wave whitecap'))
    print("('%s', '%s', '%s')" % qw_tuple("surf's-up wave rip-curl"))
def get_from_local_cache(raw_emoji):
    """Return the cached corgi image path for *raw_emoji*.

    Raises CorgiNotFoundException when nothing is cached for it.
    """
    alias = emoji.demojize(raw_emoji).replace(":", "")
    candidate = Config.CACHE_DIR + "/" + alias
    if not (os.path.exists(candidate)):
        raise CorgiNotFoundException("Corgi not found for emoji: {}"
                                     .format(raw_emoji))
    base = alias.split('/')[0]
    return Config.CACHE_DIR + "/" + base + "/01.jpg"
def workaround_freetds_bug(text):
    """ Emoticons in Instagram posts are outside of 0xffff unicode range TDS doesn't like this. We need to use emoji package to convert those pesky emoticons to text + there are some other emoticons where emoji fails, I guess I should update emoji DB. """
    text = emoji.demojize(text)
    # Manually blank a few sequences that demojize leaves untouched.
    text = text.replace(u'🇫󾓮', u' ')
    # NOTE(review): the next two replacements appear byte-identical — one
    # of them was probably meant to target a different code point; verify
    # against the original data before removing either.
    text = text.replace(u'🇺', u' ')
    text = text.replace(u'🇺', u' ')
    return text
def file_parser(filepath):
    """Parse a file of emoji-annotated lines into (text, emoji_name) pairs.

    Each line is demojized; every ':emoji_name:' occurrence yields one
    (cleaned_text, emoji_name) tuple, where cleaned_text is the line
    with ALL emoji names removed.  Lines without emoji yield nothing.
    """
    data = []
    with open(filepath, "r") as file:
        for line in file.readlines():
            text = emoji.demojize(line).rstrip("\n")
            extracted_emojis = EMOJI_NAMES_PATTERN.findall(text)
            if not extracted_emojis:
                continue
            # Hoisted out of the loop: one sub() already removes every
            # occurrence, so re-running it per extracted emoji (as the
            # old code did) was redundant work with identical results.
            text = EMOJI_NAMES_PATTERN.sub("", text)
            for emoji_name in extracted_emojis:
                data.append((text.strip(), emoji_name.strip()))
    return data
def put_in_local_cache(corgis):
    """Download every corgi image into the local cache directory.

    *corgis* maps an emoji to an image URL; each image is stored under
    CACHE_DIR/<demojized-name>/01.jpg.  Failures are logged and skipped.
    """
    # .items() replaces the iterate-then-get double lookup.
    for key, corgi in corgis.items():
        if not corgi:
            continue
        emoji_dir = emoji.demojize(key).replace(":", "")
        try:
            directory = Config.CACHE_DIR + '/' + emoji_dir
            if not os.path.exists(directory):
                os.makedirs(directory)
            urllib.request.urlretrieve(corgi, directory + "/01.jpg")
        except Exception:
            # Narrowed from a bare `except:` (which also swallowed
            # KeyboardInterrupt/SystemExit); .exception logs the traceback.
            logger.exception("Failed on: " + key)
def on_data(self, data):
    """Stream callback: write geo-tagged candidate tweets to per-candidate files.

    Only tweets carrying a non-null 'place' bounding box are kept.  The
    text is demojized and escape-normalised, then appended as a JSON-ish
    line to the file handle of every candidate keyword it mentions.
    Always returns True so the stream keeps running.
    """
    data = str(emoji.demojize(data))
    decoded = json.loads(str(data))
    if 'place' in decoded and decoded['place'] is not None:
        # First corner of the bounding box stands in for the location.
        loc = decoded['place']['bounding_box']['coordinates'][0][0]
        # unicode_escape flattens remaining non-ASCII; tweet[1:] drops
        # the leading "b" of the bytes repr.  The replace chain below is
        # order-sensitive — do not reorder.
        tweet = str(emoji.demojize(decoded['text']).encode("unicode_escape"))
        tweet = tweet[1:]
        tweet = tweet.strip("\n")
        tweet = tweet.strip("\.")
        tweet = tweet.replace("\n",". ")
        tweet = tweet.replace("\\'","'")
        tweet = tweet.replace("\\","")
        tweet = tweet.replace("\\\.",".")
        tweet = tweet.replace("\"", "'")
        tweet = tweet.replace("\\n",". ")
        print (tweet)
        tweetLower = tweet.lower()
        # One output file per candidate; a tweet may match several.
        if("trump" in tweetLower):
            trump.write('{"tweet": "' + tweet +'", "coordinates": ' + str(loc) + '}\n')
            trump.flush()
        if("sanders" in tweetLower or "bernie" in tweet.lower()):
            bernie.write('{"tweet": "' + tweet +'", "coordinates": ' + str(loc) + '}\n')
            bernie.flush()
        if("clinton" in tweetLower):
            clinton.write('{"tweet": "' + tweet +'", "coordinates": ' + str(loc) + '}\n')
            clinton.flush()
        if("rubio" in tweetLower):
            rubio.write('{"tweet": "' + tweet +'", "coordinates": ' + str(loc) + '}\n')
            rubio.flush()
        if("cruz" in tweetLower):
            cruz.write('{"tweet": "' + tweet +'", "coordinates": ' + str(loc) + '}\n')
            cruz.flush()
    return True
def default_text_handler(self, client, message):
    """
    This is the default text handler provided by Shawk.

    If self.demojize is True, this converts emoji to text and prints
    the message.  Otherwise, this simply prints the raw message text.
    """
    greeting = "Shawk received message"
    body = emoji.demojize(message.text) if self.demojize else message.text
    print("{}: {}".format(greeting, body))
async def send_reaction(self, reaction):
    """React to a message.

    Converts the unicode emoji to its textual form and posts it via the
    Slack reactions API; unknown emoji names are logged and ignored,
    any other Slack error propagates.
    """
    # NOTE(review): demojize returns ':name:' including the colons, while
    # Slack's 'name' field usually takes the bare name — confirm the API
    # tolerates the delimiters.
    emoji = demojize(reaction.emoji)
    _LOGGER.debug("Reacting with: %s", emoji)
    try:
        await self.slacker.reactions.post('reactions.add', data={
            'name': emoji,
            'channel': reaction.target,
            'timestamp': reaction.linked_event.raw_event['ts']
        })
    except slacker.Error as error:
        if str(error) == 'invalid_name':
            _LOGGER.warning('Slack does not support the emoji %s', emoji)
        else:
            raise
def clean_tweets(tweet):
    """Tokenise and normalise a tweet for downstream analysis.

    URLs are collapsed to the literal token 'URL' before tokenisation,
    tokens are lower-cased, emoji become ':alias:' labels, and
    punctuation, empty strings and stopwords are dropped.
    """
    # Replace URLs first so each survives tokenisation as one token.
    no_urls = re.sub('htt[^ ]*', 'URL', tweet)
    raw_tokens = twtokenizer.tokenize(no_urls)
    cleaned = []
    for raw in raw_tokens:
        token = emoji.demojize(raw.lower())
        # Same filters as before: punctuation, empties, stopwords.
        if token in punctuation or token == '' or token in stopwords:
            continue
        cleaned.append(token)
    return cleaned
def get_all(self):
    """Return every known emoji with its corgi image URLs.

    URLs come from the remote (S3) cache when enabled, falling back to
    the Google-spreadsheet source of truth on a miss.  Returns a dict
    with 'count', 'emojis' and 'results' keys.
    """
    all_emojis = google_spreadsheets.keys(include_empty_keys=True)
    corjis = []
    for this_emoji in all_emojis:
        corgi_urls = ""
        if settings.Config.REMOTE_CACHE_RETRIEVE:
            try:
                corgi_urls = s3.get_all(this_emoji)
            except CorgiNotFoundException as e:
                logger.warn("Corji not found for emoji %s", this_emoji)
        if not corgi_urls:
            # Cache miss (or caching disabled): hit the spreadsheet.
            corgi_urls = google_spreadsheets.get_all(this_emoji)
        # ':dog_face:' -> 'dog_face'
        emoji_name = emoji.demojize(this_emoji).replace(":", "")
        corjis.append({
            "urls": corgi_urls,
            "emoji": this_emoji,
            "emoji_name": emoji_name
        })
    return {
        "count": len(corjis),
        "emojis": [corji["emoji"] for corji in corjis],
        "results": corjis
    }
def comment_image(browser, username, comments, blacklist, logger, logfolder):
    """Post a random templated comment on the currently open image.

    Fills *username* into a randomly chosen comment template, submits it
    through the comment box, records the activity and (optionally) adds
    the user to the campaign blacklist.  Returns 1 after a short pause.
    """
    # Pick a template, fill in the username, then normalise emoji
    # aliases to real emoji via a demojize/emojize round-trip.
    rand_comment = (choice(comments).format(username))
    rand_comment = emoji.demojize(rand_comment)
    rand_comment = emoji.emojize(rand_comment, use_aliases=True)
    open_comment_section(browser)
    comment_input = get_comment_input(browser)
    try:
        if len(comment_input) > 0:
            comment_input[0].clear()
            # Re-fetch after each interaction: React may re-render the
            # input element, invalidating stale references.
            comment_input = get_comment_input(browser)
            browser.execute_script(
                "arguments[0].value = '" + rand_comment + " ';", comment_input[0])
            # An extra space is added here and then deleted.
            # This forces the input box to update the reactJS core
            comment_input[0].send_keys("\b")
            comment_input = get_comment_input(browser)
            comment_input[0].submit()
            update_activity('comments')
            if blacklist['enabled'] is True:
                action = 'commented'
                add_user_to_blacklist(
                    browser, username, blacklist['campaign'], action, logger, logfolder
                )
        else:
            logger.warning('--> Warning: Comment Action Likely Failed:'
                           ' Comment Element not found')
    except InvalidElementStateException:
        logger.info('--> Warning: Comment Action Likely Failed: Probably InvalidElementStateException')
    # NOTE(review): this logs "Commented" even when the attempt failed.
    logger.info("--> Commented: {}".format(rand_comment.encode('utf-8')))
    sleep(2)
    return 1
def parse_message(lines):
    """
    Divide the message into its components using a regex

    :param lines: list of lines to parse
    :return: list of tuples containing the different parts of the message
    """
    parsed = []
    for raw_line in lines:
        # We convert the emojis to text representation for easier handling
        demojized = emoji.demojize(raw_line)
        header = re.match(MESSAGE_REGEX, demojized)
        if header is None:
            continue
        body = header.group('message')
        modifier = re.match(REMOVE_MODIFIERS, body)
        if modifier is not None:
            body = modifier.group(1) + '' + modifier.group(3)
        # Re-order DD/MM/YY as YY/MM/DD so dates sort lexicographically.
        day_field = header.group('day')
        reordered = '{}/{}/{}'.format(day_field[6:], day_field[3:5], day_field[0:2])
        parsed.append((reordered, header.group('person'), body))
    return parsed
def on_status(self, status):
    """Stream callback: index English, non-RT tweets under each emoji they contain.

    For every emoji name found in the tweet, pushes the emoji-stripped
    tweet text onto the 'emoji-ml::<emoji>' redis list.
    """
    # TODO: avoid duplicate tweets
    tweet = status.text
    language = 'en'
    if not status.retweeted and 'RT @' not in tweet:
        try:
            language = lang(tweet)
        except LangDetectException:
            # Undetectable text falls through with the 'en' default.
            pass
        if language == 'en':
            demojized = emoji.demojize(tweet)
            extracted_emojis = EMOJI_NAMES_PATTERN.findall(demojized)
            # BUG FIX: strip the ':name:' tokens from the DEMOJIZED text —
            # the raw tweet contains none, so the old sub() was a no-op
            # and the "cleaned" text still carried the emoji.
            cleaned_text = EMOJI_NAMES_PATTERN.sub('', demojized)
            for emoji_name in extracted_emojis:
                cleaned_emoji = emoji_name.replace(':', '')
                print(cleaned_text)
                # BUG FIX: was `== ""`, which only ever stored EMPTY
                # strings; store non-empty cleaned text instead.
                if cleaned_text.strip() != "":
                    redis.rpush('emoji-ml::{}'.format(cleaned_emoji), cleaned_text)
def parse(self, tweet):
    """Fold a single tweet into the aggregate statistics.

    Tracks totals plus picture/URL/emoji presence, and per-item counts
    of hashtags, URL domains and demojized emoji.
    """
    self.stats['totalTweets'] += 1
    entities = tweet['entities']
    # Pictures: native media entities, or instagram links.
    if 'media' in entities or self._is_instagram(tweet):
        self.stats['tweetsWithPictures'] += 1
    # Hashtag occurrence counts.
    for hashtag in entities['hashtags']:
        self._increase_count(hashtag['text'], self.stats['hashtags'])
    # URL domains (netloc of the expanded URL).
    if entities['urls']:
        self.stats['tweetsWithURL'] += 1
        for url in entities['urls']:
            netloc = urlparse(url['expanded_url']).netloc
            self._increase_count(netloc, self.stats['urls'])
    # Emoji, counted under their demojized alias.
    found = self.emoji_regex.findall(tweet['text'])
    if found:
        self.stats['tweetsWithEmoji'] += 1
        for symbol in found:
            self._increase_count(demojize(symbol), self.stats['emoji'])
def replace_emojis(self, text):
    """Demojize *text*, splitting back-to-back aliases ('::' -> ': :')."""
    demojized = emoji.demojize(text)
    return re.sub('::', ': :', demojized)
def emoji(origin):
    """Replace known emoji in *origin* with their mapped text values.

    Demojizes the input, splits it into words, and substitutes each word
    (or embedded substring) found in EMOJI_CARACTER with its mapped
    value.  Returns the rewritten string, or None when nothing changed
    or on error.

    NOTE(review): this function shadows the `emoji` package name at
    module level; the local import below keeps the package reachable
    inside the body only.
    """
    try:
        import emoji
        s = emoji.demojize(origin)
        # Adjacent aliases come out as '::'; split them apart.
        s = s.replace('::', ': :')
        lista_texto = s.split()
        print(lista_texto)
        lista_demoj = []
        for palavra in lista_texto:
            parada = False  # "stop" flag for the scan over the emoji map
            cont = 0
            while not parada:
                for group in EMOJI_CARACTER.items():
                    cont += 1
                    qtd_emojis = EMOJI_CARACTER.__len__()
                    chave = group[0]   # key: emoji alias
                    valor = group[1]   # value: replacement text
                    if chave != palavra:
                        if chave in palavra:
                            # Alias embedded in a larger word: remove it,
                            # keep the remainder plus the mapped value.
                            palavra = palavra.split(chave)
                            palavra = ''.join(palavra)
                            lista_demoj.append(palavra)
                            lista_demoj.append(valor)
                            parada = True
                            break
                        else:
                            if palavra in lista_demoj:
                                # Already emitted once; skip duplicates.
                                parada = True
                                break
                            elif palavra == chave:
                                lista_demoj.append(valor)
                                parada = True
                                break
                            elif chave not in palavra and cont <= qtd_emojis:
                                # Not this alias; try the next map entry.
                                continue
                            else:
                                # Exhausted the map: keep the word as-is.
                                lista_demoj.append(palavra)
                                parada = True
                                break
                    else:
                        # Exact alias match: emit the mapped value.
                        lista_demoj.append(valor)
                        parada = True
                        break
        demoj = ' '.join(lista_demoj)
        print(origin)
        print(demoj)
        # Return None when the text is unchanged (no emoji replaced).
        if demoj == origin:
            demoj = None
            return demoj
        else:
            return demoj
    except Exception as e:
        print(e)
# --- IRC chat logger ---------------------------------------------------
# Authenticate, join the channel, then log every message (demojized) and
# report an approximate message rate every 4th received payload.
sock.settimeout(120)
sock.connect((server, port))
sock.send(f'PASS {oauth}\n'.encode('utf-8'))
sock.send(f'NICK {nickname}\n'.encode('utf-8'))
sock.send(f'JOIN #{channel}\n'.encode('utf-8'))
count = 0.0
rate = 0.0
start_time = time.time()
while True:
    resp = sock.recv(2048).decode('utf-8')
    if resp.startswith('PING'):
        # Keep-alive handshake required by the IRC server.
        sock.send('PONG\n'.encode('utf-8'))
        #logging.info(resp)
    elif len(resp) > 0:
        logging.info(demojize(resp))
    if (count % 4 == 0):
        #time.sleep(0.1)
        end_time = time.time()
        rate = 4 / (end_time - start_time)
        start_time = end_time
        logging.info(f"Calls = {count}, Messages Sent Per Second = {rate}")
    count += 1
# NOTE(review): unreachable — the loop above never exits normally.
sock.close()
# 링크들을 전부 돌아다니면서 정보수집 for link in target_links: # 타겟 데이터 찾고 필요한 모양으로 전처리 driver.get(link) driver.implicitly_wait(5) time.sleep(1) post_date = driver.find_elements_by_tag_name('time')[-1].get_attribute("datetime").split('T')[0] try: number_of_like = driver.find_element_by_class_name('Nm9Fw').text.split(' ')[1][:-1].replace(',','') except: number_of_like = driver.find_element_by_class_name('vcOH2').text.split(' ')[1][:-1].replace(',', '') posting = driver.find_elements_by_class_name('C4VMK')[0].text.split('\n')[2:-1] posting_text = '' for text in posting: text = emoji.demojize(text) text = text.replace("'", '') text = text.replace('"', '') posting_text = posting_text + ' ' + text # print(date, number_of_like, posting) # DB에 데이터 저장 query_for_insert_data = f"INSERT INTO postings(link, post_date, number_of_like, posting) VALUES ('{link}', '{post_date}', {number_of_like}, '{posting_text}');" try: cursor.execute(query_for_insert_data) except: print(link) print(query_for_insert_data) connection.commit() # break
def test_demojize_name_only_no_space():
    """Every known emoji name must round-trip through emojize/demojize."""
    for name in emoji.EMOJI_UNICODE.keys():
        rendered = emoji.emojize(name, False, True)
        restored = emoji.demojize(rendered, True)
        assert name == restored, "%s != %s" % (name, restored)
def test_shortcut_translation():
    """Every shortcut must demojize to its mapped value (and change)."""
    for shortcut, expected in emoji.shortcuts.SHORTCUTS.items():
        actual = emoji.demojize(shortcut, use_shortcuts=True)
        assert actual != shortcut
        assert expected == actual, "%s != %s" % (expected, actual)
def unicode_to_name(e):
    """Return the demojized (':name:') form of the object's name field."""
    demojized = emoji.demojize(e.name)
    return demojized
def echo(update,context):
    """Telegram handler enforcing an English-only day for the chat.

    On the configured weekday, detects the language of each message
    (caption for media posts); non-English senders get three warnings
    recorded in the `blacklist` table, then their messages are deleted
    (admins) or they are muted for 24 hours (regular members).
    """
    bot = context.bot
    chat = update.effective_chat  # type: Optional[Chat]
    # user = update.effective_user  # type: Optional[User]
    message = update.effective_message  # type: Optional[Message]
    chat_id = message.chat.id
    chat_type = message.chat.type
    message_id = message.message_id
    from_user_name = message.from_user.username
    from_user_id = message.from_user.id
    member = chat.get_member(from_user_id)
    # date = message.date
    # NOTE(review): a large commented-out draft lived here: it extracted
    # file_ids / thumbnails / sizes per media type (audio, document,
    # animation, photo, sticker, video, voice, video_note, contact) and
    # updated the `media` table keyed by caption keyword.  Removed for
    # clarity; recover from VCS history if needed.
    lock.acquire(True)
    try:
        # Which weekday (e.g. 'Mon') is english-day for this chat?
        sql = "SELECT english_day FROM setting WHERE chat_id = '%s'"%chat_id
        bar, jum = eksekusi(sql)
        if jum == 0:
            # No english-day configured for this chat: nothing to do.
            pass
        else:
            try:
                translator = Translator()
                try:
                    # Strip mentions and URLs before language detection.
                    message = re.sub(r"(?:\@|https?\://)\S+", "", message.text.encode().decode('utf-8'))
                except:
                    # No .text: fall back to the media caption.
                    if not message.caption:
                        return
                    elif message.caption ==None:
                        message = "this is caption"
                    else:
                        message = re.sub(r"(?:\@|https?\://)\S+", "", message.caption.encode().decode('utf-8'))
                        # message = message.caption.encode('ascii', 'ignore').decode('ascii')
                # Drop quoted text, slash-commands and ALL-CAPS acronyms —
                # they confuse language detection.
                message = re.sub(r'".*?"', "", message)
                message = re.sub(r'/.*', "", message)
                message = re.sub(r"\b[A-Z\.]{2,}s?\b", "", message)
                try:
                    a = translator.detect(emoji.demojize(message)).lang
                    sekarang = datetime.datetime.now()
                    tanggal = '{:%Y-%m-%d}'.format(sekarang)
                    hari = datetime.datetime.strftime(sekarang.date(),"%a")
                    # Enforce only on the configured day, for non-English text.
                    if hari == bar[0][0] and a != 'en':
                        cek = "SELECT user_id, mute FROM blacklist WHERE chat_id = '%s' AND user_id = '%s' AND tanggal = '%s'"%(chat_id,from_user_id,tanggal)
                        bar, jum = eksekusi(cek)
                        if jum == 0:
                            # First offence today: create the record.
                            infut = "INSERT INTO blacklist (chat_id, chat_type, user_id, user_name, mute,tanggal) VALUES ('%s','%s','%s','%s',0,'%s')"%(chat_id, chat_type, from_user_id, from_user_name,tanggal)
                            cur.execute(infut)
                            db.commit()
                            bot.send_message(chat_id, random.choice(teks),
                                             reply_to_message_id=message_id)
                        elif jum != 0 and bar[0][1] < 3:
                            # Repeat offence: bump the strike counter.
                            infut = "UPDATE blacklist SET mute = mute+1 WHERE chat_id = '%s' AND user_id = '%s' AND tanggal = '%s'"%(chat_id, from_user_id,tanggal)
                            cur.execute(infut)
                            db.commit()
                            sisa = 2-bar[0][1]  # warnings remaining
                            if sisa == 0:
                                if member.status == 'administrator' or member.status == 'creator':
                                    bot.send_message(
                                        chat_id,
                                        'Your next-non-english chat will be deleted.',
                                        reply_to_message_id=message_id)
                                else:
                                    bot.send_message(
                                        chat_id,
                                        'Your next-non-english chat will make you muted to this group for 24 hours.',
                                        reply_to_message_id=message_id)
                            else:
                                bot.send_message(
                                    chat_id,
                                    'You have %s remaining'%(sisa),
                                    reply_to_message_id=message_id)
                        elif jum!=0 and bar[0][1]==3:
                            # Strikes exhausted: delete (admins) or mute 24h.
                            if member.status == 'administrator' or member.status == 'creator':
                                try:
                                    update.effective_message.delete()
                                except:
                                    bot.send_message(
                                        chat_id,
                                        'Gak bisa di delete nih',
                                        reply_to_message_id=message_id)
                            elif member.can_send_messages is None or member.can_send_messages:
                                mutetime = datetime.datetime.now()+datetime.timedelta(hours=24)
                                tanggalmute = sekarang = '{:%Y-%m-%d %H:%M:%S}'.format(mutetime)
                                infut = "UPDATE blacklist SET mute_sampe_tanggal = '%s' WHERE chat_id = '%s' AND user_id = '%s' AND tanggal = '%s'"%(tanggalmute,chat_id, from_user_id,tanggal)
                                cur.execute(infut)
                                db.commit()
                                bot.restrict_chat_member(chat_id, from_user_id,
                                                         until_date=mutetime,
                                                         can_send_messages=False)
                                bot.send_message(chat_id,
                                                 "Restricted until {}!".format(tanggalmute),
                                                 reply_to_message_id=message_id)
                            else:
                                bot.send_message(chat_id, "Already muted.",
                                                 reply_to_message_id=message_id)
                except:
                    # NOTE(review): bare except hides real failures here.
                    bot.send_message(chat_id, "Im stupid bot",
                                     reply_to_message_id=message_id)
            except:
                bot.send_message(chat_id,str(traceback.format_exc()),
                                 reply_to_message_id=message_id)
    finally:
        lock.release()
""" Return whether the string can be interpreted as a date. :param string: str, string to check for date :param fuzzy: bool, ignore unknown tokens in string if True """ try: parse(string, fuzzy=fuzzy) return True except ValueError: return False # Iterate through all the files and send messages in whatsapp for file in os.listdir(chatsFolder): filename = 'B-{}'.format(str(file[19:-4])) whatsapp.selectContact(filename.strip()) # Strips the newline character from the end of message message_file = open(chatsFolder + file, 'r') Lines = message_file.readlines() message = "" for line in Lines: if is_date(line[0:8]): whatsapp.sendMessage(emoji.demojize(message, delimiters=("", ""))) message = line else: message = message + line os.rename(chatsFolder + file, restoredFolder + filename)
def add_more_posts(companies, addDirection, addDate):
    """Extend each company's Instagram CSV with older or newer posts.

    addDirection is 'beginning' (scrape from addDate up to the oldest
    stored post) or 'end' (from the newest stored post to addDate).
    Scraped rows are appended to
    ../../../data/all_instagram_posts/<company>.csv.
    """
    for company in companies:
        ## import company csv as dataframe
        csvName = company + '.csv'
        output_path = pathlib.Path('../../../data/all_instagram_posts')
        df = pd.read_csv(output_path.joinpath(csvName))
        ## get earliest and latest date
        oldestDate = pd.to_datetime(df['date_utc'].min())  ## earliest date in data
        recentDate = pd.to_datetime(df['date_utc'].max())  ## most recent date in data
        if addDirection == 'beginning':
            SINCE = oldestDate
            UNTIL = addDate
        if addDirection == 'end':
            SINCE = addDate
            UNTIL = recentDate
        posts = instaloader.Profile.from_username(instagram.context, company).get_posts()
        processed = 1
        # Posts iterate newest-first: skip while newer than SINCE, keep
        # while newer than UNTIL.
        # NOTE(review): this requires SINCE >= UNTIL chronologically —
        # verify the bounds chosen above for each direction satisfy that.
        for post in takewhile(lambda p: p.date > UNTIL, dropwhile(lambda p: p.date > SINCE, posts)):
            print(post.date)
            print("...scraping info for post %i, %s" % (processed, company))
            # Text fields are utf-8 encoded with errors ignored so odd
            # characters cannot break the CSV writer.
            post_info = {
                "shortcode": post.shortcode,
                "username": company,
                "date_utc": post.date_utc.strftime('%Y-%m-%d %H:%M:%S.%f'),
                "is_video": "yes" if post.is_video else "no",
                "is_sponsored": post.is_sponsored,
                "hashtags": (",".join(post.caption_hashtags)).encode('utf-8', errors='ignore'),
                "mentions": (",".join(post.caption_mentions)).encode('utf-8', errors='ignore'),
                "caption": (emoji.demojize(post.caption)).encode('utf-8', errors='ignore') if post.caption else "",
                "video_view_count": post.video_view_count if post.is_video else 0,
                "video_length": post.video_duration if post.is_video else 0,
                "likes": post.likes,
                "comments": post.comments,
                "location_name": (post.location.name).encode('utf-8', errors='ignore') if post.location else "",
                "location_latlong": " ".join((str(post.location.lat), str(post.location.lng))) if post.location else ""
            }
            processed += 1
            file_path = os.path.join(output_path, csvName)
            fieldnames=["shortcode", "username", "date_utc", "is_video",
                        "is_sponsored", "hashtags", "mentions", "caption",
                        "video_view_count", "video_length", "likes",
                        "comments", "location_name", "location_latlong"]
            #bigdict = {'column_1': 1, 'column_2': 2, 'column_3': 3}
            # Append-only: assumes the header row already exists in the CSV.
            with open(file_path, 'a+') as csv_file:
                #fieldnames = ['column_1', 'column_2', 'column_3']
                writer = csv.DictWriter(csv_file, fieldnames=fieldnames, delimiter=',')
                #if '\n' not in csv_file.readlines()[-1]:
                #    csv_file.write("\n")
                writer.writerow(post_info)
        print("...scraped %i posts for %s" % (processed - 1, company))
    print("Done scraping!")
""" @author : macab (macab@debian) @file : emoji @created : Wednesday Mar 20, 2019 23:05:24 IST """ import emoji if __name__ == "__main__": # grinning face print("\U0001f600") # grinning squinting face print("\U0001F606") # rolling on the floor laughing print("\U0001F923") print(emoji.emojize(":grinning_face_with_big_eyes:")) print(emoji.demojize('😍'))
def clean_tweet(self, text):
    """Normalise raw tweet text for modelling.

    Pipeline order matters: unicode repair -> HTML strip -> contraction
    unpacking -> entity replacement (URLs, emails, phones, numbers,
    currency) -> accent removal -> emoticon/emoji to words -> word
    splitting -> punctuation/number/short-word removal -> whitespace
    normalisation.
    (Commented lines preserve the pre-textacy `preprocess.*` API calls.)
    """
    # FIXED UNICODE
    # text = preprocess.fix_bad_unicode(text)
    text = ftfy.fix_text(text)
    # GET TEXT ONLY FROM HTML
    text = BeautifulSoup(text, features='lxml').getText()
    # UN-PACK CONTRACTIONS
    text = preprocess.unpack_contractions(text)
    # REMOVE URL
    # text = preprocess.replace_urls(text)
    text = preprocessing.replace_urls(text)
    # REMOVE EMAILS
    # text = preprocess.replace_emails(text)
    text = preprocessing.replace_emails(text)
    # REMOVE PHONE NUMBERS
    # text = preprocess.replace_phone_numbers(text)
    text = preprocessing.replace_phone_numbers(text)
    # REMOVE NUMBERS
    # text = preprocess.replace_numbers(text)
    text = preprocessing.replace_numbers(text)
    # REMOVE CURRENCY
    # text = preprocess.replace_currency_symbols(text)
    text = preprocessing.replace_currency_symbols(text)
    # REMOVE ACCENTS
    # text = preprocess.remove_accents(text)
    text = preprocessing.remove_accents(text)
    # CONVERT EMOJIS TO TEXT: first map ASCII smileys via the lookup
    # table, then demojize the rest and strip the ':' delimiters.
    words = text.split()
    reformed = [
        self.SMILEY[word] if word in self.SMILEY else word
        for word in words
    ]
    text = " ".join(reformed)
    text = emoji.demojize(text)
    text = text.replace(":", " ")
    text = ' '.join(text.split())
    # SPLIT ATTACHED WORDS (CamelCase -> separate words)
    text = ' '.join(re.findall('[A-Z][^A-Z]*', text))
    # SPLIT UNDERSCORE WORDS
    text = text.replace('_', ' ')
    # REMOVE PUNCTUATION
    # text = preprocess.remove_punct(text)
    text = preprocessing.remove_punctuation(text)
    # Remove numbers
    text = re.sub(r'\d', '', text)
    # REMOVE WORDS LESS THAN 3 CHARACTERS
    text = re.sub(r'\b\w{1,2}\b', '', text)
    # NORMALIZE WHITESPACE
    # text = preprocess.normalize_whitespace(text)
    text = preprocessing.normalize_whitespace(text)
    return text
def clean_text(val):
    """Full cleaning pipeline: spelling fixes, tweet-preprocessor clean,
    then demojize + punctuation stripping + whitespace collapse."""
    corrected = misspelled_correction(val)
    preprocessed = p.clean(corrected)
    depunctuated = punctuation(emoji.demojize(preprocessed))
    return ' '.join(depunctuated.split())
punctuations = '''()-[]{};:'"\,<>./@#$%^&_~''' for x in val.lower(): if x in punctuations: val = val.replace(x, " ") return val # In[8]: punctuation("test ombak@ #ldfldlf??? !! ") # In[9]: data.clean_content = data.clean_content.apply( lambda x: ' '.join(punctuation(emoji.demojize(x)).split())) # In[10]: def clean_text(val): val = misspelled_correction(val) val = p.clean(val) val = ' '.join(punctuation(emoji.demojize(val)).split()) return val # In[11]: clean_text("saya punya ide💡 bag00ss@@ ! ? ")
json.dump(e_codes_json, f) # In[ ]: #def get_emoji_counts(master): emoji_counts = {} for i in range(0,len(master)): tweet = master.loc[i,'statusText'] date = master.loc[i,'statusCreatedAt'] date = datetime.datetime.strptime(date,'%a %b %d %H:%M:%S %Z %Y') date_ft = date.strftime('%Y_%m_%d') tokens = twtokenizer.tokenize(tweet) cleanWords = [word for word in cleanWords if word[0:3] != 'htt'] tokens = [emoji.demojize(token) for token in tokens if token != ':'] # tokens = [word for word in tokens if word not in string.punctuation] for token in tokens: if re.match(':+*:',token): if date_ft not in emoji_counts.keys(): emoji_counts[date_ft] = {} if token in emoji_counts[date_ft]: emoji_counts[date_ft][token] +=1 else: emoji_counts[date_ft][token] = 1 return emoji_counts # In[ ]:
def test_smile_emoji2():
    """A ':smile:' alias inside surrounding text must pass through demojize unchanged."""
    sample = u'(test asdad :smile:)'
    result = emoji.demojize(sample, use_shortcuts=True)
    assert result == u'(test asdad :smile:)'
def transTwts(configs, dest):
    """Continuously pull the newest tweet from MySQL, clean it, translate it
    with googletrans into ``dest`` and into English, and print the results.

    configs -- nested dict; ``configs['database']`` holds host/user/password/db
               (values may arrive wrapped in double quotes, which are stripped).
    dest    -- target language code passed to Translator.translate().

    NOTE(review): on any error the ``except`` branch calls transTwts()
    recursively, so repeated failures grow the call stack without bound --
    a retry loop would be safer.  ``df.ix`` is long deprecated (removed in
    pandas 1.0); presumably ``df.loc``/``df.iloc`` is intended -- confirm
    the pinned pandas version.
    """
    print('transTwts() started. ')
    try:
        #global database_paras, con
        sql_twts = db_op.SQL_tweets()
        # Credential values may be wrapped in literal double quotes; strip them.
        host = configs['database']['host'].replace('"', '')
        user = configs['database']['user'].replace('"', '')
        password = configs['database']['password'].replace('"', '')
        db = configs['database']['db'].replace('"', '')
        db_info_list = []
        db_info_list.append(host)
        db_info_list.append(user)
        db_info_list.append(password)
        db_info_list.append(db)
        # Comma-joined credential string, as expected by the db_op helpers.
        db_info_str = ','.join(db_info_list)
        database_paras = db_op.Database_parameters(host, user, password, db)
        #select_sql = r"SELECT tid, text FROM tweets.tweet where tweet_lang <> 'en' order by tid desc limit 50;"
        #select_sql = r"SELECT tid, text, url1 FROM tweets.tweet where tweet_lang = 'ar' order by tid desc limit 1 ;"
        # Fetch only the single most recent tweet per polling cycle.
        select_sql = r"SELECT tid, text, url1, tweet_lang FROM tweets.tweet order by tid desc limit 1 ;"
        translate = Translator()
        while True:
            # Open a fresh connection per poll; close it right after the read.
            con = sql_twts.connect2database(database_paras)
            #results = sql_twts.select_db(r'SET SESSION TRANSACTION ISOLATION LEVEL READ UNCOMMITTED ;', con)
            results = sql_twts.select_db(select_sql, con)
            con.close()
            #print('results in transTwts: ', results)
            if len(results) < 1:
                # Nothing new yet; back off before polling again.
                time.sleep(10)
                continue
            df = pd.DataFrame(list(results),
                              columns=['tweetID', 'text', 'url1', 'tweet_lang'])
            df = df.sort_values(by=['tweetID'], ascending=False)
            df['tweetID'] = df['tweetID'].astype(str)
            # ``i`` is computed but unused below -- leftover from the
            # random-sampling variant kept in the comments.
            i = int(random.random() * len(df))
            #print('df: ', df.ix[0, 'url1'])
            #print('i: ', i)
            #df = df.iloc[i]
            texts = list(df['text'])
            #urls = list(df['url1'])
            #texts = texts[i:i+1]  # randomly get 1 tweet
            #print('texts: ', texts)
            for j in range(len(texts)):
                # Strip t.co links, '#' marks and @mentions, then spell emoji
                # out as :name: text so the translator preserves them.
                texts[j] = re.sub(r'https{0,1}:\/\/t.co\/[a-zA-Z0-9]+', '', texts[j])
                texts[j] = re.sub(r'#', '', texts[j])
                texts[j] = re.sub(r'@[a-zA-Z0-9_]+', '', texts[j])
                #RE_EMOJI = re.compile('[\U00010000-\U0010ffff]', flags=re.UNICODE)
                #texts[j] = RE_EMOJI.sub(r'', texts[j])
                texts[j] = emoji.demojize(texts[j])
            #print('texts: ', texts)
            trans = translate.translate(texts, dest)
            traneEn = translate.translate(texts, 'en')
            # NOTE(review): the comprehension variable ``i`` shadows the outer ``i``.
            translateds = [i.text for i in trans]
            translateds_En = [i.text for i in traneEn]
            #url_list = url1.split(";")
            for j in range(len(texts)):
                #print(r'df[text]: ', df.ix[i, 'text'])  # restore the random tweet
                lang = df.ix[j, 'tweet_lang'].strip()
                try:
                    # LANGUAGES maps ISO codes to names; unknown codes fall back.
                    lang_full = LANGUAGES[lang].capitalize()
                except:
                    lang_full = 'Unknown'
                print(r'Tweets translation (original language is {}): {} {}'.format(lang_full, df.ix[j, 'text'], df.ix[j, 'url1'].replace(';', ' ')))
                #print(r'df[text]: ', df.ix[j, 'url1'].replace(';', ' '))
                print('Tweets translation (English): ', translateds_En[j])
                print('Tweets translation (Chinese): ', translateds[j])
                print('')
            # print(r'df[text]: ', df['text'])
            # print('translateds: ', translateds)
            time.sleep(20)
            #print('Translation(text): {} , {}'.format(trans.text, text))
            # (a long commented-out block followed here in the original,
            #  updating tweet classification flags via sql_cls.update_rows;
            #  left out of execution exactly as before)
            # # sql_cls = db_op.SQL_tweets()
            # images_ID = list(df['tweetID'])
            # # classified = list(df['classified'])
            # con = sql_twts.connect2database(database_paras)
            # # if len(labels) > 0:
            # # tried_url = 3 : the tweet images have been classified.
            # # print('Probs: ', type(probs))
            # # print('Probs: ', probs)
            # sql_cls.update_rows('tweet', ['tid', 'Flooded', 'classified', 'tried_url', 'Flooded_prob'],
            #                     [images_ID, flooded, classified, [3] * len(images_ID), probs], db_info_str)
            # print('labels: ', labels)
            # # # con.close() #
    except Exception as e:
        print("Error in transTwts(): ", str(e))
        time.sleep(10)
        # NOTE(review): unbounded recursion used as a retry mechanism.
        transTwts(configs, dest)
def test_shortcuts():
    # ':S' is a shortcut for :confounded:; \U0001F376 is the sake emoji.
    expected = u':sake_bottle_and_cup: :confounded: :confounded: :confounded:'
    actual = emoji.demojize(u'\U0001F376 :S :S :S', no_space=True,
                            use_shortcuts=True)
    assert actual == expected
def test_shortcuts():
    # Shortcut expansion with no_space: ':S' becomes :confounded:.
    result = emoji.demojize(u'\U0001F376 :S :S :S',
                            no_space=True,
                            use_shortcuts=True)
    assert result == u':sake_bottle_and_cup: :confounded: :confounded: :confounded:'
# NOTE(review): this block looks like two concatenated scratch scripts (the
# coding comment below marks where a second file apparently begins).  The
# grouping of statements inside/outside the __main__ guard is inferred from
# context -- confirm against the original layout.
if __name__ == '__main__':
    trans()

print(u'\U0001f604'.encode('unicode-escape'))
print(u'\U0001f604')
ss = u'\U0001f604'
#xx = chr(ss[0])
#print("ss({}) xx({})".format(ss, xx))

# -*- coding: UTF-8 -*-
# convert to unicode
teststring = "I am happy \U0001f604"
#
#teststring = unicode(teststring, 'utf-8')
# encode it with string escape
teststring = teststring.encode('unicode_escape')
print("💗 Growing Heart")
print(emoji.emojize('Water! :water_wave:'))
print(emoji.demojize(u'🌊'))  # for Python 2.x
# print(emoji.demojize('🌊')) # for Python 3.x.
print(u"And \U0001F60D")
# Various equivalent ways of producing a code point from its hex value.
print("(-woman) astronaut", chr(int("0001f680", 16)))
print("woman_astronaut", chr(int("0x0001f680", 0)))
# Dancer + medium-dark skin-tone modifier, combined and separate.
print("\U0001f483\U0001f3fe")
print(chr(0x001f483), chr(0x001f3fe))
print('💃 🏾 ')
print(chr(0x001f483) + chr(0x001f3fe))
print('💃🏾 ')
print(chr(int('1f483', 16)) + chr(int('1f3fe', 16)))
def test_demojize_complicated_string():
    # emojize followed by demojize must reproduce the original, including
    # skin-tone modifiers and trailing non-emoji symbols.
    original = u"testing :baby::emoji_modifier_fitzpatrick_type-3: with :eyes: :eyes::eyes: modifiers :baby::emoji_modifier_fitzpatrick_type-5: to symbols ヒㇿ"
    round_tripped = emoji.demojize(emoji.emojize(original))
    assert original == round_tripped, "%s != %s" % (original, round_tripped)
async def ebay_handle(group, task):
    """Build one eBay custom product report end-to-end.

    Flow: drop stale result rows for this task, query Elasticsearch for the
    matching products, enrich each hit with its category path from MySQL,
    upsert one row per product, write the aggregated totals onto the task
    row (status=1), then insert a success notification for the user.  Any
    exception flips the task to failed (status=2) and prepares a failure
    notification instead.

    group -- unused here; presumably the worker group that delivered the task.
    task  -- raw task message; unwrapped via ANATask into its data dict.
    """
    hy_task = ANATask(task)
    task_log = [hy_task.task_type, hy_task.task_data]  # kept for tracing; unused below
    # logger.info("connecting")
    task = hy_task.task_data
    # All DB timestamps are shifted +8h (UTC+8) from the server clock.
    time_now = (datetime.now() + timedelta(hours=8)).strftime('%Y-%m-%d %H:%M:%S')
    with engine.connect() as conn:
        # Remove result rows left over from a previous run of this task.
        del_body = delete(ebay_product_report_result).where(
            ebay_product_report_result.c.task_id == task['task_id'],
        )
        conn.execute(del_body)
        try:
            es = ESBody()
            # Per task: build the ES query, run it, persist results to the DB.
            search_body = es.create_search(task)
            # Restrict the query to what this user is permitted to see.
            search_body = await get_permission_es_body(task['user_id'], search_body, task['site'])
            logger.info("========================es请求体================================")
            logger.info(json.dumps(search_body))
            logger.info("========================es请求体================================")
            es_connection = Elasticsearch(hosts=EBAY_ELASTICSEARCH_URL, timeout=ELASTIC_TIMEOUT)
            # NOTE(review): Elasticsearch() is normally the sync client, yet
            # .search is awaited here -- presumably an async-capable client is
            # configured; confirm.
            index_result = await es_connection.search(
                index=task['index_name'],
                body=search_body,
                size=task['result_count'])
            # logger.info(index_result)
            # Product hits that make up the report.
            the_es_result = index_result['hits']['hits']
            name_ids = []
            # Collect every leaf category id referenced by the hits.
            for item in the_es_result:
                # logger.info(item)
                for category_id in item['_source']['leaf_category_id']:
                    name_ids.append(category_id)
            # Fetch the name/id paths for those category ids.
            select_category_name = select([
                ebay_category.c.category_name,
                ebay_category.c.category_id,
                ebay_category.c.category_id_path,
                ebay_category.c.category_name_path
            ]).where(
                and_(
                    ebay_category.c.category_id.in_(name_ids),
                    ebay_category.c.site == task['site']
                ))
            cursor_name = conn.execute(select_category_name)
            records_name = cursor_name.fetchall()
            logger.info("=======补全category_path的id========")
            logger.info(name_ids)
            logger.info("===============")
            # Attach a readable category path to every hit that matches.
            for db_info in records_name:
                for category in the_es_result:
                    for low_id in category['_source']['leaf_category_id']:
                        # logger.info(low_id)
                        if low_id == db_info['category_id']:
                            name_list = db_info['category_name_path'].split(':')
                            id_list = db_info['category_id_path'].split(':')
                            complete_list = []
                            category['_source']['category_path'] = []
                            try:
                                # Keep at most the first three path levels;
                                # shorter paths raise IndexError from pop(0),
                                # caught below and the partial list kept.
                                for i in range(3):
                                    complete_list.append({"name": name_list.pop(0), "id": id_list.pop(0)})
                                category['_source']['category_path'].append(complete_list)
                            except Exception as e:
                                logger.info(e)
                                category['_source']['category_path'].append(complete_list)
            # Upsert each product row and accumulate the report totals.
            get_result_count = 0
            sum_data = {
                "sold_total": 0,
                "sum_sold_last_3": 0,
                "sum_sold_last_7": 0,
                "sum_sold_last_1": 0,
                "sum_gmv_last_3": 0,
                "sum_gmv_last_7": 0,
                "sum_gmv_last_1": 0
            }
            for item in the_es_result:
                # Build the per-product row.
                sum_data['sold_total'] += item['_source']['sold_total']
                sum_data['sum_sold_last_3'] += item['_source']['sold_last_3']
                sum_data['sum_sold_last_7'] += item['_source']['sold_last_7']
                sum_data['sum_sold_last_1'] += item['_source']['sold_last_1']
                sum_data['sum_gmv_last_3'] += item['_source']['gmv_last_3']
                sum_data['sum_gmv_last_7'] += item['_source']['gmv_last_7']
                sum_data['sum_gmv_last_1'] += item['_source']['gmv_last_1']
                result_info = {
                    "task_id": task['task_id'],
                    "item_id": item['_source']['item_id'],
                    "img": item['_source']['img'],
                    # demojized, presumably so the title survives a
                    # non-utf8mb4 MySQL column -- confirm the column charset
                    "title": emoji.demojize(item['_source']['title']),
                    "site": item['_source']['site'],
                    "brand": item['_source']['brand'],
                    # built above from the MySQL category lookup
                    "category_path": str(item['_source']['category_path']),
                    "store_location": item['_source']['store_location'],
                    "item_location": item['_source']['item_location'],
                    "item_location_country": item['_source']['item_location_country'],
                    "seller": item['_source']['seller'],
                    "price": item['_source']['price'],
                    "gmv_last_3_pop": item['_source']['gmv_last_3_pop'],
                    "gmv_last_3": item['_source']['gmv_last_3'],
                    "gmv_last_1": item['_source']['gmv_last_1'],
                    "gmv_last_7": item['_source']['gmv_last_7'],
                    "sold_last_7": item['_source']['sold_last_7'],
                    "sold_last_1": item['_source']['sold_last_1'],
                    "sold_last_3": item['_source']['sold_last_3'],
                    "visit": item['_source']['visit_last_1'],
                    # conversion rate, guarded against division by zero
                    "cvr": item['_source']['sold_last_1'] / item['_source']['visit_last_1'] if item['_source'][
                        'visit_last_1'] != 0 else 0,
                    # NOTE(review): 'date' uses the unshifted server clock
                    # while 'update_time' is +8h -- confirm intentional.
                    "date": (datetime.now()).strftime('%Y-%m-%d %H:%M:%S'),
                    "update_time": time_now
                }
                # logger.info(result_info)
                # Insert the product row, updating it on duplicate key.
                ins = insert(ebay_product_report_result)
                insert_stmt = ins.values(result_info)
                on_duplicate_key_stmt = insert_stmt.on_duplicate_key_update(
                    task_id=insert_stmt.inserted.task_id,
                    item_id=insert_stmt.inserted.item_id,
                    img=insert_stmt.inserted.img,
                    title=insert_stmt.inserted.title,
                    site=insert_stmt.inserted.site,
                    brand=insert_stmt.inserted.brand,
                    seller=insert_stmt.inserted.seller,
                    price=insert_stmt.inserted.price,
                    category_path=insert_stmt.inserted.category_path,
                    store_location=insert_stmt.inserted.store_location,
                    item_location=insert_stmt.inserted.item_location,
                    item_location_country=insert_stmt.inserted.item_location_country,
                    gmv_last_3_pop=insert_stmt.inserted.gmv_last_3_pop,
                    gmv_last_3=insert_stmt.inserted.gmv_last_3,
                    gmv_last_1=insert_stmt.inserted.gmv_last_1,
                    gmv_last_7=insert_stmt.inserted.gmv_last_7,
                    sold_last_7=insert_stmt.inserted.sold_last_7,
                    sold_last_1=insert_stmt.inserted.sold_last_1,
                    sold_last_3=insert_stmt.inserted.sold_last_3,
                    visit=insert_stmt.inserted.visit,
                    cvr=insert_stmt.inserted.cvr,
                    date=insert_stmt.inserted.date,
                )
                result = conn.execute(on_duplicate_key_stmt)
                # logger.info(result)
                get_result_count += 1
            # Mark the task finished and store the aggregated totals.
            logger.info(sum_data)
            ins = update(ebay_custom_report_task)
            ins = ins.values({
                "status": 1,
                "update_time": time_now,
                "get_result_count": get_result_count,
                "product_total": get_result_count,
                "sold_total": sum_data['sold_total'],
                "sum_sold_last_3": sum_data['sum_sold_last_3'],
                "sum_sold_last_7": sum_data['sum_sold_last_7'],
                "sum_sold_last_1": sum_data['sum_sold_last_1'],
                "sum_gmv_last_3": round(sum_data['sum_gmv_last_3'], 2),
                "sum_gmv_last_7": round(sum_data['sum_gmv_last_7'], 2),
                "sum_gmv_last_1": round(sum_data['sum_gmv_last_1'], 2)
            }).where(
                ebay_custom_report_task.c.task_id == task['task_id']
            )
            result = conn.execute(ins)
            # logger.info(result)
            # Insert the "report ready" user notification.
            ins_msg = insert(ana_user_msg)
            insert_stmt_msg = ins_msg.values(
                {
                    "user_id": task['user_id'],
                    "msg_id": str(task['user_id']) + str(int(time.time())),
                    "msg_content": "您的Ebay自定义报告" + task['report_name'] + "于" + time_now + "生成成功,请及时查看!",
                    "create_at": time_now,
                    "status": 0
                }
            )
            result_msg = conn.execute(insert_stmt_msg)
        except Exception as e:
            logger.info(e)
            # Mark the task failed (aggregate fields intentionally left unset).
            ins = update(ebay_custom_report_task)
            ins = ins.values({
                "status": 2,
                "update_time": time_now,
                # "get_result_count": get_result_count,
                # "product_total": get_result_count,
                # "sold_total": sum_data['sold_total'],
                # "sum_sold_last_3": sum_data['sold_last_3'],
                # "sum_sold_last_7": sum_data['sold_last_7'],
                # "sum_sold_last_1": sum_data['sold_last_1'],
                # "sum_gmv_last_3": round(sum_data['gmv_last_3'], 2),
                # "sum_gmv_last_7": round(sum_data['gmv_last_7'], 2),
                # "sum_gmv_last_1": round(sum_data['gmv_last_1'], 2)
            }).where(
                ebay_custom_report_task.c.task_id == task['task_id']
            )
            result = conn.execute(ins)
            # logger.info(result)
            # Build the "report failed" user notification.
            # NOTE(review): unlike the success path, this statement is never
            # passed to conn.execute() within this excerpt -- confirm whether
            # the execute call was lost.
            ins_msg = insert(ana_user_msg)
            insert_stmt_msg = ins_msg.values(
                {
                    "user_id": task['user_id'],
                    "msg_id": str(task['user_id']) + str(int(time.time())),
                    "msg_content": "您的Ebay自定义报告" + task['report_name'] + "于" + time_now + "生成失败,请重新编辑条件或联系网站管理员!",
                    "create_at": time_now,
                    "status": 0
                }
            )
def form_emoji_dict(s):
    """Append the demojized emoji found in *s* onto the global emoji_chain."""
    found = [ch for ch in s if ch in emoji.UNICODE_EMOJI]
    demojized = emoji.demojize(' '.join(found))
    emoji_chain.append(demojized)
def comment_image(browser, username, comments, blacklist, logger, logfolder): """Checks if it should comment on the image""" # check action availability if quota_supervisor("comments") == "jump": return False, "jumped" rand_comment = random.choice(comments).format(username) rand_comment = emoji.demojize(rand_comment) rand_comment = emoji.emojize(rand_comment, use_aliases=True) open_comment_section(browser, logger) # wait, to avoid crash sleep(3) comment_input = get_comment_input(browser) try: if len(comment_input) > 0: # wait, to avoid crash sleep(2) comment_input = get_comment_input(browser) # below, an extra space is added to force # the input box to update the reactJS core comment_to_be_sent = rand_comment # wait, to avoid crash sleep(2) # click on textarea/comment box and enter comment (ActionChains(browser).move_to_element( comment_input[0]).click().send_keys( comment_to_be_sent).perform()) # wait, to avoid crash sleep(2) # post comment / <enter> (ActionChains(browser).move_to_element(comment_input[0]).send_keys( Keys.ENTER).perform()) update_activity( browser, action="comments", state=None, logfolder=logfolder, logger=logger, ) if blacklist["enabled"] is True: action = "commented" add_user_to_blacklist(username, blacklist["campaign"], action, logger, logfolder) else: logger.warning("--> Comment Action Likely Failed!" "\t~comment Element was not found") return False, "commenting disabled" except InvalidElementStateException: logger.warning("--> Comment Action Likely Failed!" "\t~encountered `InvalidElementStateException` :/") return False, "invalid element state" logger.info("--> Commented: {}".format(rand_comment.encode("utf-8"))) Event().commented(username) # get the post-comment delay time to sleep naply = get_action_delay("comment") sleep(naply) return True, "success"
def onPressButton(self):
    """Route a pressed main-menu reply-keyboard button to the matching action.

    ``self.user.message.text`` holds the button label; each branch below
    handles one of the profile / buy / sell / info / support buttons.
    """
    button = self.user.message.text
    logger.info('{} - нажата кнопка {}'.format(self.user.message.chat.id,
                                               emoji.demojize(button)))
    if button == self.profileButton:
        # Profile: balance summary plus purchases/sales/invite/coupon actions.
        markup = telebot.types.InlineKeyboardMarkup()
        markup.add(
            telebot.types.InlineKeyboardButton(
                text='🛒 Мои покупки', callback_data='my_purchases'))
        markup.add(
            telebot.types.InlineKeyboardButton(text='💰 Мои продажи',
                                               callback_data='my_sales'))
        markup.add(
            telebot.types.InlineKeyboardButton(
                text='🤝 Пригласить друга', callback_data='invite_message'))
        markup.add(
            telebot.types.InlineKeyboardButton(
                text='🏷 Ввести код купона', callback_data='enter_coupon_code'))
        bot.send_message(self.user.message.chat.id,
                         '₴ Баланс: {}\n'
                         '🛒 Покупок: {}\n'
                         '💰 Продаж: {}'.format(
                             self.user.balance,
                             db.get_purchases(self.user.id),
                             db.get_sells(self.user.id)),
                         reply_markup=markup)
    elif button == self.buyButton:
        # switch the user to the shop page/state
        self.user.setState('shop')
        if db.get_selling_products():
            bot.send_message(self.user.message.chat.id,
                             'Товары в продаже:',
                             parse_mode='HTML',
                             reply_markup=Page(self.user).getMarkup())
            for product in db.get_selling_products():
                text = '\n\n🔹 {}\nЦена: {} ₴\nКупить: /buy_{}'.format(
                    product['title'], product['price'],
                    utils.convertInt(product['id']))
                photos = db.get_sale_app_photos(product['id'])
                media_group = []
                # Only the first photo of the album carries the caption.
                for num in range(len(photos)):
                    media_group.append(
                        types.InputMediaPhoto(
                            photos[num]['photo'],
                            caption=text if num == 0 else ''))
                bot.send_media_group(self.user.message.chat.id, media_group)
        else:
            bot.send_message(
                self.user.message.chat.id,
                'К сожалению, сейчас ничего нет в продаже. Почему бы не продать что-то?',
                reply_markup=Page(self.user).getMarkup())
    elif button == self.sellButton:
        if db.check_sale_rules(self.user.id) == 1:
            # switch the user to the sale-creation page/state
            self.user.setState('sale')
            # refresh the page (keyboard markup)
            bot.reply_to(
                self.user.message,
                "Вы начали создание товара на продажу, если Вы передумали что-либо продавать или ввели неккоректные данные, нажмите кнопку Отмена. После создания заявки на продажу, модераторы проверят её и Ваш товар станет доступен для покупки другим пользователям. Статус обработки заявки можно посмотреть в личном кабинете.",
                reply_markup=Page(self.user).getMarkup()).wait()
            bot.send_message(self.user.id, "Напишите название вашего товара")
        else:
            # First-time sellers must accept the sale rules first.
            markup = telebot.types.InlineKeyboardMarkup()
            markup.add(
                telebot.types.InlineKeyboardButton(
                    text='Принять соглашение',
                    callback_data='sale_confirm_rules'))
            bot.send_message(
                self.user.id,
                'Перед созданием первого товара Вам нужно ознакомиться с правилами и советами:\n\n'
                '- Сделайте хорошие фотографии с нескольких ракурсов\n'
                '- Составьте подробное описание товара\n',
                reply_markup=markup)
    elif button == self.infoButton:
        bot.send_message(
            self.user.id,
            '{} \n {}'.format(msg.info_text, self.user.balance))
    elif button == self.supportButton:
        self.user.setState('support')
        bot.reply_to(
            self.user.message,
            "Все Ваши сообщения, отправленные после этого будут переданы администрации\nДля завершения нажмите на кнопку 'Завершить'",
            reply_markup=Page(self.user).getMarkup())
def command(self, event):
    """Echo the demojized form of the first argument back to the caller."""
    raw = event["spec"][0]
    nickname = event["user"].nickname
    event["stdout"].write("%s: %s" % (nickname, emoji.demojize(raw)))
async def sc(self, ctx):
    """Render the invoking message as a YouTube-style "super chat" image
    with a random donation amount, then post the image to the channel.

    The original message is deleted; custom stamps and unicode emoji in the
    text are pasted into the image as 20x20 PNGs.
    """
    user = ctx.message.author
    # Strip the command prefix (first 4 characters of the cleaned content).
    msg = ctx.message.clean_content[4:]
    await ctx.message.delete()
    # Generate a random donation amount.
    money = self._get_random_money()
    # Colour scheme matching the amount tier.
    colors = self._get_money_colors(money)
    # Create and draw the card rectangles.
    main_color = colors['main_color']
    back_color = colors['back_color']
    name_color = colors['name_color']
    text_color = colors['text_color']
    # Wrap at 36 columns; emoji are swapped for placeholder chars and
    # returned separately in emoji_list.
    format_msg, emoji_list = self._format_text(36, msg)
    stamp_list = await self._get_custom_stamp_list(ctx.guild, msg)
    lines = format_msg.count(os.linesep)
    text_height = 22
    font_size = 20
    # Header band is 100px; body grows with the number of wrapped lines.
    height = 150 + lines * text_height
    im = Image.new("RGBA", (450, height), tuple(main_color))
    draw = ImageDraw.Draw(im)
    draw.rectangle((0, 100, 450, height), fill=tuple(back_color))
    # Render the text.
    name_font = ImageFont.truetype(
        str(self.path / "font/migu-1m-regular.ttf"), font_size)
    # The user name alone uses a slightly lighter colour.
    draw.multiline_text((110, 20),
                        user.display_name,
                        fill=tuple(name_color),
                        font=name_font)
    del name_font
    text_font = ImageFont.truetype(
        str(self.path / "font/migu-1m-bold.ttf"), font_size)
    draw.multiline_text((110, 50),
                        f"¥ {'{:,}'.format(money)}",
                        fill=tuple(text_color),
                        font=text_font)
    draw.multiline_text((20, 115),
                        format_msg,
                        fill=tuple(text_color),
                        font=text_font)
    offset = [0, 0]
    prev_str = ''
    for i, s in enumerate(format_msg):
        # Full-width characters advance a full cell, half-width half a cell.
        if unicodedata.east_asian_width(s) in 'FWA':
            offset[0] += font_size
        else:
            offset[0] += int(font_size / 2)
        # Replace custom-stamp ('&@') and emoji ('&%') placeholders with images.
        if s in ['@', '%'] and prev_str == '&':
            pos = [20 + offset[0] - font_size, 115 + offset[1]]
            # Paint over the placeholder characters first.
            draw.rectangle(
                (pos[0], pos[1], pos[0] + font_size, pos[1] + font_size),
                fill=tuple(back_color))
            # Custom (guild) stamp: fetch and paste its image.
            if s == '@':
                data = io.BytesIO(await stamp_list.pop(0).read())
                stamp_img = Image.open(data).convert('RGBA').resize(
                    (20, 20), Image.BICUBIC)
                im.paste(stamp_img, (pos[0], pos[1]), stamp_img.split()[3])
            # Unicode emoji: look up the bundled PNG by its demojized name.
            elif s == '%':
                if len(emoji_list) > 0:
                    emoji_str = emoji.demojize(emoji_list.pop(0))[1:-1]
                    # Some emoji are not converted, so check the file exists
                    # just in case (as of 2020-10-04 only ⛩).
                    emoji_img_path = self.path / f'img/emoji/{emoji_str}.png'
                    if os.path.isfile(emoji_img_path):
                        emoji_img = Image.open(emoji_img_path).convert(
                            'RGBA')
                        im.paste(emoji_img, (pos[0], pos[1]),
                                 emoji_img.split()[3])
        prev_str = s
        if s == '\n':
            offset[0] = 0
            offset[1] += text_height
    # Fetch the author's avatar and convert it to an Image.
    data = io.BytesIO(await user.avatar_url.read())
    thum = Image.open(data).convert('RGBA')
    del data
    thum = thum.resize((60, 60), Image.BICUBIC)
    # Composite the avatar as a circle using the luminance mask.
    mask = Image.open(self.path /
                      "img/superchat/mask_circle.jpg").convert('L')
    im.paste(thum, (25, 20), mask.resize((60, 60), Image.HAMMING))
    im.save(self.path / "img/superchat/superchat.png")
    del im
    await ctx.send(file=discord.File(self.path /
                                     "img/superchat/superchat.png"))
def clean_tweets_opt(tweet, lower=True, demoji=True, punc=True, stopwords=(),
                     num=False, url=True, stem=False, repeatedChar=False,
                     users=False):
    """Tokenize a tweet and apply the selected normalisation steps.

    Parameters
    ----------
    tweet : str            raw tweet text
    lower : bool           lowercase every token
    demoji : bool          replace emoji with their :name: labels
    punc : bool            drop tokens that are pure punctuation
    stopwords : iterable   tokens to drop (immutable default replaces the
                           original mutable ``[]`` default)
    num : bool             collapse digit runs into the token 'NUM'
    url : bool             mask URLs with 'URL' before tokenization
    stem : bool            stem each token
    repeatedChar : bool    squeeze 3+ repeated characters down to exactly 3
    users : bool           replace @mentions with 'USER'

    Returns
    -------
    list[str]              cleaned, non-empty tokens
    """
    # URLs must be masked before tokenization, or the tokenizer splits them.
    if url:
        tweet = re.sub(r'htt[^ ]*', 'URL', tweet)
    cleanWords = twtokenizer.tokenize(tweet)

    # lower  # Convert to Lowercase
    if lower:
        cleanWords = [word.lower() for word in cleanWords]

    # demoji  # Convert Emoji's to Word Label
    if demoji:
        cleanWords = [emoji.demojize(word) for word in cleanWords]

    # punc  # Only removes a token when punctuation is the entire token.
    if punc:
        cleanWords = [word for word in cleanWords if word not in punctuation]

    # Remove caller-supplied stop words.
    cleanWords = [word for word in cleanWords if word not in stopwords]

    # num  # Identify digit runs and collapse them to 'NUM'
    # (raw string fixes the original invalid "\d" escape).
    if num:
        cleanWords = [re.sub(r"\d+", "NUM", x) for x in cleanWords]

    # stem  # Stem each token.
    if stem:
        cleanWords = [stemmer.stem(x) for x in cleanWords]

    # repeatedChar  # Keep elongated words distinguishable but standardized
    # ('soooooo' -> 'sooo').
    if repeatedChar:
        cleanWords = [re.sub(r'(.)\1{2,}', r'\1\1\1', word)
                      for word in cleanWords]

    # users  # startswith() is safe on empty tokens, unlike word[0].
    if users:
        cleanWords = ['USER' if word.startswith('@') else word
                      for word in cleanWords]

    ## Non-optional post-processing: drop empty tokens.
    cleanWords = [x for x in cleanWords if x != '']
    return cleanWords
def test_demojize_name_only():
    # Every known :name: must survive an emojize/demojize round trip.
    for name in emoji.EMOJI_UNICODE.keys():
        round_tripped = emoji.demojize(emoji.emojize(name, False))
        assert name == round_tripped, "%s != %s" % (name, round_tripped)
def test_shortcut_translation():
    # Every shortcut must be changed, and changed to its mapped long form.
    for shortcut in emoji.shortcuts.SHORTCUTS.keys():
        translated = emoji.demojize(shortcut, use_shortcuts=True)
        assert translated != shortcut
        expected = emoji.shortcuts.SHORTCUTS[shortcut]
        assert expected == translated, "%s != %s" % (expected, translated)
def test_smile_emoji():
    # Alias-emojize, demojize with shortcuts, then emojize again: the result
    # must match a plain alias-emojize of the same text.
    txt = u'(<some text> :smile:)'
    expected = emoji.emojize(txt, use_aliases=True)
    demojized = emoji.demojize(emoji.emojize(txt, use_aliases=True),
                               use_shortcuts=True)
    assert emoji.emojize(demojized) == expected
def read_data(X_train, X_test, Y_path, sentence_txt, bigdict, word2vec_model):
    """Build the (X, Y) training matrices backed by a word2vec embedding.

    X_train / X_test : CSV paths with a 'comment' column
    Y_path           : CSV path with a 'label' column for the training rows
    sentence_txt     : cache file of jieba-segmented sentences (one per line)
    bigdict          : jieba dictionary path
    word2vec_model   : path used to load (or save) the gensim model

    Side effects: sets the module-global ``embedding_layer`` and writes the
    sentence cache and word2vec model to disk on a cache miss.

    NOTE(review): the bare ``except`` blocks treat *any* failure as a cache
    miss, and the files opened below are not closed on error paths --
    with-blocks would be safer.  TRAIN_NUM is hard-coded to the size of the
    labelled training set.
    """
    TRAIN_NUM = 119018  # number of labelled training rows
    try:
        # Fast path: reuse a previously written sentence cache.
        print('Loading Sentences')
        sentences = word2vec.LineSentence(sentence_txt)
    except:
        # Cache miss: segment train+test comments and write the cache file.
        print('Reading data to sentences')
        data = pd.read_csv(X_train)
        X_data = data['comment'].values
        testdata = pd.read_csv(X_test)
        X_testdata = testdata['comment'].values
        print(X_data.shape)  # (12000,)
        print(X_testdata.shape)
        X_words = []
        jieba.set_dictionary(bigdict)
        for i in range(len(X_data)):
            # Demojize first so emoji survive segmentation as :name: tokens.
            line = emoji.demojize(X_data[i])
            seg_list = list(jieba.cut(line, cut_all=False))
            X_words.append(seg_list)
        for j in range(len(X_testdata)):
            line = emoji.demojize(X_testdata[j])
            seg_list = list(jieba.cut(line, cut_all=False))
            X_words.append(seg_list)
        out = open(sentence_txt, "w")
        for sen in X_words:
            for word in sen:
                out.write(word)
                out.write(' ')
            out.write('\n')
        out.close()
        sentences = word2vec.LineSentence(sentence_txt)
    # word2vec: load a previously trained model, else train and save one.
    try:
        print('Loading word2vec model')
        w2v_model = word2vec.Word2Vec.load(word2vec_model)
    except:
        print('Training word2vec model')
        w2v_model = word2vec.Word2Vec(sentences, iter=32, size=128,
                                      min_count=3, workers=4, sg=1)
        w2v_model.save(word2vec_model)
    # Row 0 of the embedding matrix stays all-zero for unknown words.
    embedding_matrix = np.zeros(
        (len(w2v_model.wv.vocab.items()) + 1, w2v_model.vector_size))
    word2idx = {}
    vocab_list = [(word, w2v_model.wv[word])
                  for word, _ in w2v_model.wv.vocab.items()]
    for v, vocab in enumerate(vocab_list):
        word, vec = vocab
        embedding_matrix[v + 1] = vec
        word2idx[word] = v + 1
    global embedding_layer
    # Frozen Keras embedding layer, consumed by model-building code elsewhere.
    embedding_layer = Embedding(input_dim=embedding_matrix.shape[0],
                                output_dim=embedding_matrix.shape[1],
                                weights=[embedding_matrix],
                                trainable=False)
    # Re-read the cache and map each word to its index (0 = unknown).
    X_vecs = []
    readfile = open(sentence_txt, "r")
    for line in readfile:
        new_doc = []
        for word in line.split():
            try:
                new_doc.append(word2idx[word])
            except:
                new_doc.append(0)
        X_vecs.append(new_doc)
        # Only the first TRAIN_NUM cached rows are labelled training data.
        if len(X_vecs) >= TRAIN_NUM:
            break
    X = np.array(X_vecs)
    print(X.shape)
    label = pd.read_csv(Y_path)
    Y_data = label['label'].values
    Y = np.array(Y_data)
    Y = Y[0:TRAIN_NUM]
    return X, Y
def getEMOJI(self, text):
    """Replace each emoji in *text* with an upper-cased ' TK.EMOJI.NAME ' token.

    The text is first demojized (emoji -> :name:), then every :name:
    occurrence is rewritten as a space-padded TK.EMOJI.* token so it
    tokenizes cleanly.
    """
    def replacement(match):
        return ' TK.EMOJI.' + match.group(1).upper() + ' '

    text = emoji.demojize(text)
    # Raw string fixes the original's invalid '\:' escapes; digits are added
    # to the class because demojized names can contain them
    # (e.g. :emoji_modifier_fitzpatrick_type-3:), which '[a-z_-]+' missed.
    return re.sub(r':([a-z0-9_-]+):', replacement, text)
import re

print("Start cleaning Data")
# The big traditional-Chinese dictionary improves jieba's segmentation.
jieba.load_userdict("./data/dict.txt.big")

# Read the raw training comments ("id,text" per line); the with-block
# replaces the original open()/close() pair.
with open("./data/train_x.csv") as train_file:
    train_x = train_file.readlines()

# Matches runs of ASCII and full-width CJK punctuation.  Raw string keeps
# the identical pattern while silencing invalid-escape warnings.
punctuation_search = re.compile(
    r"[\s+\.\!\/_,$%^*(+\"\']+|[+——\>\<!,。??、\-~~@#¥%……&*():]+")

clean_data = []
# ``idx`` replaces ``id``, which shadowed the builtin.
for idx in range(len(train_x)):
    train_x[idx] = train_x[idx].replace("\n", "")
    # Drop the leading id column; keep everything after the first comma.
    train_x[idx] = train_x[idx].split(",", maxsplit=1)[1]
    word_list = jieba.lcut(train_x[idx])
    # Demojize per token so emoji survive as :name: words.
    word_list = [emoji.demojize(w) for w in word_list]
    clean_list = []
    for word in word_list:
        # ``is None`` replaces the original ``type(check) == type(None)``.
        if punctuation_search.match(word, 0) is None:
            clean_list.append(word)
    if len(clean_list) != 0:
        clean_data.append(clean_list)

print("Start training word2vec")
word2vec_model = gensim.models.Word2Vec(clean_data, size=200, window=5,
                                        min_count=5, workers=3, iter=30)
def emoji_as_words(emoji_list):
    """Demojize each emoji into its bare name (no ':' delimiters)."""
    return [emoji.demojize(symbol, delimiters=('', ''))
            for symbol in emoji_list]
def parse_comment_for_vote(body):
    """turns a comment into a vote, if possible"""
    demojized_body = demojize(body)
    return parse_emojis_for_vote(demojized_body)
def preprocessing(text):
    """Clean a sports (NBA/football) post title/body for NLP.

    Steps: strip newlines/links, spell out emoji, expand stat and league
    acronyms, drop numbers/times/currency and boilerplate thread markers,
    then remove stop words and punctuation via spaCy.

    Fix over the original: the BPG substitution reused the BLK pattern
    (``\\bblk(s?)\\b``), so 'bpg' was never expanded; it now matches
    ``\\bbpg\\b``.
    """
    # Remove newlines.
    text = text.replace("\n", "")
    # Remove links.
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'www\S+', '', text)
    # Replace emoji with their aliases.
    text = emoji.demojize(text)
    # Strip the colons around each alias.
    text = re.sub(r'(:)(.*?)(:)', r' \2 ', text)
    # Split multi-word aliases on the underscore.
    text = re.sub(r'_', ' ', text)
    # Remove slashes and pipes.
    text = re.sub(r'/', ' ', text)
    text = re.sub(r'\|', ' ', text)
    # Unwrap parenthesised text (raw string replaces the old '\g<2>').
    text = re.sub(r'(\()([^)]+)(\))', r'\g<2>', text)
    # Ensure a space after closing brackets.
    text = text.replace(']', '] ')
    # Remove standalone numbers (but not those attached via a hyphen).
    text = re.sub(r'(?<![a-zA-Z]-)(\b\d+\b)', ' ', text)
    # Remove minutes / millions.
    text = re.sub(r'\d+m\b', ' ', text)
    # Remove ordinal positions.
    text = re.sub(r'\d+th\b', ' ', text)
    text = re.sub(r'\d+st\b', ' ', text)
    text = re.sub(r'\d+nd\b', ' ', text)
    text = re.sub(r'\d+rd\b', ' ', text)
    # Remove hours / times.
    text = re.sub(r'\d+h\b', ' ', text)
    text = re.sub(r'\d+am\b', ' ', text)
    text = re.sub(r'(\b\d+h\d+\b)', ' ', text)
    # Remove years.
    text = re.sub(r'\d+s\b', ' ', text)
    # Expand number-attached stat abbreviations.
    text = re.sub(r'(\d+)(ppg)', ' point per game', text, flags=re.IGNORECASE)
    text = re.sub(r'(\d+)(pt(s?))', ' point', text, flags=re.IGNORECASE)
    text = re.sub(r'(\d+)(reb(s?))', ' rebound', text, flags=re.IGNORECASE)
    text = re.sub(r'(\d+)(rpg)', ' rebound per game', text, flags=re.IGNORECASE)
    text = re.sub(r'(\d+)(ast(s?))', ' assist', text, flags=re.IGNORECASE)
    text = re.sub(r'(\d+)(apg)', ' assist per game', text, flags=re.IGNORECASE)
    text = re.sub(r'(\d+)(stl(s?))', ' steal', text, flags=re.IGNORECASE)
    text = re.sub(r'(\d+)(spg)', ' steal per game', text, flags=re.IGNORECASE)
    text = re.sub(r'(\d+)(blk(s?))', ' block', text, flags=re.IGNORECASE)
    text = re.sub(r'(\d+)(bpg)', ' block per game', text, flags=re.IGNORECASE)
    text = re.sub(r'(\d+)(OT(s?))', ' overtime', text)
    text = re.sub(r'(\d+)(pm)', ' three-point field goal made', text,
                  flags=re.IGNORECASE)
    text = re.sub(r'(\d+)(pa)', ' three-point field goal attempted', text,
                  flags=re.IGNORECASE)
    text = re.sub(r'(\d+)(P%)', ' three-point field goal', text)
    # Expand the TS% acronym.
    text = re.sub(r'(\bTS%\b)', 'throw shooting percentage', text)
    # Normalise free-kick / free-throw spellings.
    text = re.sub(r'(\bfree kick(s?)\b)', 'free-kick', text, flags=re.IGNORECASE)
    text = re.sub(r'(\bfree throw(s?)\b)', 'free-throw', text, flags=re.IGNORECASE)
    text = re.sub(r'(\bfreethrown(s?)\b)', 'free-throw', text, flags=re.IGNORECASE)
    # Expand percentage acronyms.
    text = re.sub(r'(\bthrow%\b)', 'throw shooting percentage', text)
    text = re.sub(r'(\bfield goal%\b)', 'field goal', text)
    # Expand referee / time / league acronyms.
    text = re.sub(r'(\bref\b)', 'referee', text, flags=re.IGNORECASE)
    text = re.sub(r'(\bOT\b)', 'overtime', text)
    text = re.sub(r'(\bET\b)', 'extra-time', text)
    text = re.sub(r'(\bWC\b)', 'World Cup', text)
    text = re.sub(r'(\bEPL\b)', 'English Premier League', text)
    text = re.sub(r'(\bPL\b)', 'Premier League', text)
    text = re.sub(r'(\bVAR\b)', 'Video Assistant Referee', text)
    text = re.sub(r'(\bUCL\b)', 'Uefa Champions League', text)
    text = re.sub(r'(\bCL\b)', 'Champions League', text)
    text = re.sub(r'(\bUEL\b)', 'Uefa Europa League', text)
    text = re.sub(r'(\bEL\b)', 'Europa League', text)
    # Expand position acronyms.
    text = re.sub(r'(\bsg\b)', 'shooting guard', text, flags=re.IGNORECASE)
    text = re.sub(r'(\bg\b)', 'shooting guard', text, flags=re.IGNORECASE)
    text = re.sub(r'(\bc\b)', 'center', text, flags=re.IGNORECASE)
    text = re.sub(r'(\bpf\b)', 'power forward', text, flags=re.IGNORECASE)
    text = re.sub(r'(\bsf\b)', 'small forward', text, flags=re.IGNORECASE)
    text = re.sub(r'(\bf\b)', 'forward', text, flags=re.IGNORECASE)
    # Expand standalone stat acronyms.
    text = re.sub(r'(\bppg\b)', 'point per game', text, flags=re.IGNORECASE)
    text = re.sub(r'(\bpt(s?)\b)', 'point', text, flags=re.IGNORECASE)
    text = re.sub(r'(\breb(s?)\b)', 'rebound', text, flags=re.IGNORECASE)
    text = re.sub(r'(\brpg\b)', 'rebound per game', text, flags=re.IGNORECASE)
    text = re.sub(r'(\bast(s?)\b)', 'assist', text, flags=re.IGNORECASE)
    text = re.sub(r'(\bassts\b)', 'assist', text, flags=re.IGNORECASE)
    text = re.sub(r'(\bapg\b)', 'assist per game', text, flags=re.IGNORECASE)
    text = re.sub(r'(\bstl(s?)\b)', 'steal', text, flags=re.IGNORECASE)
    text = re.sub(r'(\bspg(s?)\b)', 'steal per game', text, flags=re.IGNORECASE)
    text = re.sub(r'(\bblk(s?)\b)', 'block', text, flags=re.IGNORECASE)
    # BUGFIX: was '(\bblk(s?)\b)', already consumed by the BLK rule above,
    # so 'bpg' never got expanded.
    text = re.sub(r'(\bbpg\b)', 'block per game', text, flags=re.IGNORECASE)
    # Hyphenate triple-double / double-double.
    text = re.sub(r'(\btriple double\b)', 'triple-double', text, flags=re.IGNORECASE)
    text = re.sub(r'(\bdouble double\b)', 'double-double', text, flags=re.IGNORECASE)
    # Expand field-goal / free-throw acronyms.
    text = re.sub(r'(\bFGM\b)', 'field goal made', text, flags=re.IGNORECASE)
    text = re.sub(r'(\bFGA\b)', 'field goal attempted', text, flags=re.IGNORECASE)
    text = re.sub(r'(\bFG\b)', 'field goal', text, flags=re.IGNORECASE)
    text = re.sub(r'(\bFTM\b)', 'free throw made', text, flags=re.IGNORECASE)
    text = re.sub(r'(\bFTA\b)', 'free throw attempted', text, flags=re.IGNORECASE)
    text = re.sub(r'(\bFT\b)', 'free throw', text, flags=re.IGNORECASE)
    # Remove boilerplate thread markers.
    text = re.sub(r'(\bhighlight(s?)\b)', '', text, flags=re.IGNORECASE)
    text = re.sub(r'(\bpost\b( ?))?(\bpre\b( ?))?(-?)(\bmatch\b) (\bthread\b)',
                  '', text, flags=re.IGNORECASE)
    text = re.sub(r'(\bpost\b( ?))?(\bpre\b( ?))?(-?)(\bgame\b) (\bthread\b)',
                  '', text, flags=re.IGNORECASE)
    text = re.sub(r'(\bdaily\b( ?))?(\bdiscussion(s)?\b)(( ?)\bthread\b)?',
                  '', text, flags=re.IGNORECASE)
    text = re.sub(r'(\bbreaking\b)', '', text, flags=re.IGNORECASE)
    text = re.sub(r'(\bfree talk friday\b)', '', text, flags=re.IGNORECASE)
    text = re.sub(r'(\bVIDEO\b)', '', text)
    # Remove '+' and currency symbols.
    text = re.sub(r'(\+)', '', text)
    text = text.replace('£', '')
    text = text.replace('$', '')
    text = text.replace('€', '')
    # Remove the OC (original content) marker.
    text = re.sub(r'(\bOC\b)', '', text)
    # Collapse runs of spaces.
    text = re.sub(r' {2,}', ' ', text)
    # Drop stop words and punctuation via spaCy.
    lemmas = [
        token for token in nlp(text)
        if not token.is_stop and not token.is_punct
    ]
    text = " ".join(str(token) for token in lemmas)
    # NOTE(review): this also turns an existing 'Serie A' into 'Serie A A';
    # preserved as-is pending confirmation of the intended normalisation.
    text = text.replace('Serie', 'Serie A')
    return text