def extract_years(snippet, output):
    """Extract dates from the snippet and pair them with the confidence
    scores computed by extract_entities_textrazor.

    :param snippet: text snippet to parse
    :param output: entity dictionary holding a 'Y' (year) entry
    :return: output with its 'Y' entry replaced by the matched dates
    """
    jar_files = os.path.join(os.path.dirname(__file__), 'jars')
    sutime = SUTime(jars=jar_files, mark_time_ranges=True)
    res = json.dumps(sutime.parse(snippet), sort_keys=True, indent=4)

    # Scan the pretty-printed JSON for every '"value": "..."' field.
    dates_list = []
    for i in range(len(res)):
        if res[i:i + 5] == 'value':
            j = i + 9
            while res[j] != '"':
                j += 1
            dates_list.append(res[i + 9:j])

    dic_year = output['Y']
    dates_list_new = {'entity': [], 'confidenceScore': []}
    for i in range(len(dic_year['entity'])):
        for ele in dates_list:
            if dic_year['entity'][i][0] in ele:
                if ele not in dates_list_new['entity']:
                    dates_list_new['entity'].append(ele)
                    dates_list_new['confidenceScore'].append(
                        dic_year['confidenceScore'][i])
    output['Y'] = dates_list_new
    return output
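# Note: the string scan over pretty-printed JSON above is fragile (a literal
# "value" inside a date string would break it). Since sutime.parse already
# returns a list of dicts, the same values can be collected directly; a
# minimal sketch (extract_year_values is an illustrative helper, not part of
# the original module):
def extract_year_values(snippet, jar_files):
    """Collect the 'value' field of every temporal expression SUTime finds."""
    sutime = SUTime(jars=jar_files, mark_time_ranges=True)
    return [annotation['value']
            for annotation in sutime.parse(snippet)
            if 'value' in annotation]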
def sutime_function(text):
    translator = Translator()
    # Translate the Romanian input to English so SUTime can parse it.
    traducere = translator.translate(text, src='ro', dest='en').text
    java_target = os.path.join('java', 'target')
    jar_files = os.path.join(os.path.dirname(__file__), java_target)
    sutime = SUTime(jars=jar_files, mark_time_ranges=True)

    ttext = []
    ttype = []
    tmpdictionar = {}
    for annotation in sutime.parse(traducere):
        for key, value in annotation.items():
            if key == "text":
                ttext.append(convert_to_romana(value))
            elif key == "type":
                ttype.append(convert_to_romana(value))

    # Group the extracted texts by their temporal type.
    for i in range(len(ttext)):
        try:
            tmpdictionar[ttype[i]].append(ttext[i])
        except KeyError:
            tmpdictionar[ttype[i]] = [ttext[i]]
    return tmpdictionar
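# The try/except grouping above can be written more directly with
# collections.defaultdict; a sketch of the same grouping step, assuming the
# parallel ttext/ttype lists built above:
from collections import defaultdict

def group_by_type(ttext, ttype):
    """Group extracted temporal texts by their SUTime type."""
    grouped = defaultdict(list)
    for text_value, type_value in zip(ttext, ttype):
        grouped[type_value].append(text_value)
    return dict(grouped)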
class NLUWrapper(object):
    def __init__(self, host='localhost', port=5001, **kwargs):
        self.host, self.port = host, port
        self.sutime = SUTime(jars=os.path.join(os.path.dirname(__file__),
                                               'python-sutime',
                                               'jars'),
                             mark_time_ranges=True)
        print('Initialized with {}:{}'.format(self.host, self.port))

    def annotate(self, in_utterance, modules=()):
        sutime_response = None
        try:
            if 'SUTime' in modules:
                sutime_response = self.sutime.parse(in_utterance)
                modules = [module for module in modules if module != 'SUTime']
            response = requests.post(
                'http://{}:{}/annotate'.format(self.host, self.port),
                json={'state': {'utterance': in_utterance},
                      'modules': modules},
                timeout=5)
        except requests.Timeout:
            return {}
        assert response.status_code == 200, 'Error calling the NLU service'
        result = response.json()
        if sutime_response is not None:
            result['annotations']['SUTime'] = sutime_response
        return result

    def annotate_sentiment(self, in_utterance):
        response = self.annotate(in_utterance,
                                 modules=['Preprocessor', 'VaderNLTK'])
        return response['annotations']['sentiment']

    def annotate_ner(self, in_utterance):
        response = self.annotate(in_utterance,
                                 modules=['Preprocessor', 'StanfordNER'])
        return response['annotations'].get('ner', {})

    def annotate_pos(self, in_utterance):
        response = self.annotate(in_utterance,
                                 modules=['Preprocessor', 'MorphoTagger'])
        return response['annotations'].get('postag', [])

    def annotate_abuse(self, in_utterance):
        response = self.annotate(
            in_utterance, modules=['Preprocessor', 'AlanaAbuseDetector'])
        return response['annotations'].get('abuse', {})
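# Usage sketch for NLUWrapper, assuming an annotation service is listening on
# the default host/port (SUTime itself runs locally via the bundled jars):
wrapper = NLUWrapper(host='localhost', port=5001)
annotations = wrapper.annotate('See you tomorrow at 3pm',
                               modules=('Preprocessor', 'SUTime'))
# Temporal expressions are parsed locally; other modules come from the service.
print(annotations.get('annotations', {}).get('SUTime'))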
class DateLinker(BasePipeline):
    def __init__(self, resource_folder=None):
        self.annotator_name = 'Date_Linker'
        if resource_folder is None:
            resource_folder = os.path.join(os.path.dirname(__file__),
                                           '../resources/sutime/')
        self.resource_folder = resource_folder
        self.sutime = SUTime(jars=self.resource_folder)

    def run(self, document):
        dates = self.sutime.parse(document.text)
        pattern = re.compile(r"^-*\d*-*\d*-*\d*-*$")
        for date in dates:
            if date["type"] == "DATE" and pattern.match(date["value"]):
                val = date["value"]
                # Strip a leading minus (years BCE) when measuring the length.
                body = val[1:] if val[0] == '-' else val
                if len(body) == 4:     # year only
                    stdform = val + '-00-00T00:00:00Z^^http://www.w3.org/2001/XMLSchema#dateTime'
                elif len(body) == 7:   # year and month
                    stdform = val + '-00T00:00:00Z^^http://www.w3.org/2001/XMLSchema#dateTime'
                elif len(body) == 10:  # full date
                    stdform = val + 'T00:00:00Z^^http://www.w3.org/2001/XMLSchema#dateTime'
                else:
                    stdform = val + '^^<http://www.w3.org/2001/XMLSchema#dateTime>'
                start = date["start"]
                end = date["end"]
                entity = Entity(uri=stdform,
                                boundaries=(start, end),
                                surfaceform=document.text[start:end],
                                annotator=self.annotator_name)
                document.entities.append(entity)
        return document
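# For reference, the padding rule on its own (values here are illustrative):
# a bare year from SUTime is padded out to a full xsd:dateTime literal.
val = '1923'
stdform = val + '-00-00T00:00:00Z^^http://www.w3.org/2001/XMLSchema#dateTime'
# -> '1923-00-00T00:00:00Z^^http://www.w3.org/2001/XMLSchema#dateTime'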
class timeDelta:
    def __init__(self, path):
        # Initialize SUTime
        jar_files = os.path.join(os.path.dirname(path), 'jars')
        self.sutime = SUTime(jars=jar_files,
                             mark_time_ranges=False,
                             include_range=True)

    def get_times(self, text):
        # Get all time values found by SUTime
        parsed = self.sutime.parse(text)
        values = []
        for dic in parsed:
            values.append(dic['value'])
        return values
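# Usage sketch, assuming the jars live in a 'jars' directory next to the
# given path, as the constructor expects:
td = timeDelta(__file__)
print(td.get_times('The meeting runs from 2pm to 3pm tomorrow'))
# -> a list of normalized values, e.g. ['2024-05-03T14:00', '2024-05-03T15:00']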
def extract_entitites(snippet):
    """Extract entities from a snippet.

    :param snippet: a snippet in English
    :return: the extracted person names, organization names, and years,
        in a dictionary named output
    """
    nlp = StanfordCoreNLP('http://localhost:9000')
    res = nlp.annotate(snippet,
                       properties={
                           'annotators': 'ner',  # 'sutime'
                           'outputFormat': 'json',
                           # 'timeout': 1000,
                       })

    output = {'RN': [], 'U': [], 'Y': []}

    # Extract the university and person names.
    for sent in range(len(res['sentences'])):
        for element in res['sentences'][sent]['tokens']:
            if element['ner'] == 'PERSON':
                output['RN'].append(element['word'])
            if element['ner'] == 'ORGANIZATION':  # or element['ner'] == 'LOCATION'
                output['U'].append(element['word'])

    # Extract the years by scanning SUTime's pretty-printed JSON output,
    # as in extract_years above.
    jar_files = os.path.join(os.path.dirname(__file__), 'jars')
    sutime = SUTime(jars=jar_files, mark_time_ranges=True)
    res = json.dumps(sutime.parse(snippet), sort_keys=True, indent=4)
    for i in range(len(res)):
        if res[i:i + 5] == 'value':
            j = i + 9
            while res[j] != '"':
                j += 1
            output['Y'].append(res[i + 9:j])
    return output
class Streambot:
    """Stream Twitter and look for tweets that contain targeted words;
    when tweets are found, look for a datetime and room, and if present
    save the tweet to the OutgoingTweet model.

    Ex.
    bot = Streambot()
    # to run a stream looking for tweets about PyCon
    bot.run_stream(["PyCon"])
    """

    def __init__(self):
        self.api = self.setup_auth()
        self.stream_listener = StreamListener(self)
        jar_files = os.path.join(BASE_DIR, "python-sutime", "jars")
        self.sutime = SUTime(jars=jar_files, mark_time_ranges=True)
        self.slacker = Slacker(s.SLACK_TOKEN)

    def setup_auth(self):
        """Set up auth for the API and return a tweepy API object."""
        auth = tweepy.OAuthHandler(s.openspaces["CONSUMER_KEY"],
                                   s.openspaces["CONSUMER_SECRET"])
        auth.set_access_token(s.openspaces["ACCESS_TOKEN"],
                              s.openspaces["ACCESS_TOKEN_SECRET"])
        api = tweepy.API(auth)
        return api

    def run_stream(self, search_list=None):
        """Start the stream; when a matching tweet is found the on_status
        method is called. search_list is a list of terms to look for in tweets.
        """
        if search_list is None:
            raise ValueError("Need a list of search terms as arg to run_stream")
        stream = tweepy.Stream(auth=self.api.auth, listener=self.stream_listener)
        stream.filter(track=search_list)

    def send_mention_tweet(self, screen_name):
        """Mention a user in a tweet from the bot letting them know that their
        tweet has been received and that we will send out reminders about
        their event.
        """
        hours_mins = time_utils.get_local_clock_time()
        mention = "@{} just saw your Open Spaces tweet at {}."
        mention += " Pending approval we'll retweet a reminder before your event!"
        mention = mention.format(screen_name, hours_mins)
        try:
            self.api.update_status(status=mention)
        except tweepy.TweepError:
            # If the same user tweets a valid openspaces tweet at the exact
            # same clock time it causes a duplicate tweet, which the bot
            # can't send.
            loggly.info("duplicate tweet by openspaces bot in send_mention_tweet")

    def send_slack_message(self, channel, message):
        """Send a slack message to a channel.

        channel options: #outgoing_tweets, #need_review, #event_conflict
        """
        self.slacker.chat.post_message(channel, message)

    def parse_time_room(self, tweet):
        """Get time and room number from a tweet using SUTime and tweet_utils."""
        extracted_time = self.sutime.parse(tweet)
        time_and_room = tweet_utils.get_time_and_room(tweet, extracted_time)
        return time_and_room

    def value_check(self, time_room_obj):
        """Return a tuple with the counts of values extracted from a tweet in
        the parse_time_room method. This tuple is used to decide how the bot
        will respond to a tweet.
        """
        num_room_values = len(time_room_obj["room"])
        num_time_values = len(time_room_obj["date"])
        return (num_room_values, num_time_values)

    def retweet_logic(self, tweet, tweet_id, screen_name, user_id):
        """Use SUTime to try to parse a datetime out of a tweet; if successful,
        save the tweet to OutgoingTweet to be retweeted.
        """
        # use SUTime to parse a datetime out of the tweet
        time_room = self.parse_time_room(tweet)

        # make sure both time and room were extracted, and only one value each
        val_check = self.value_check(time_room)
        if val_check == (1, 1):
            room = time_room["room"][0]
            date_mention = tweet_utils.check_date_mention(tweet)
            converted_time = time_utils.convert_to_utc(time_room["date"][0],
                                                       date_mention)

            # check for a time and room conflict: only 1 set of retweets per
            # event; the default range a room is reserved for is -15/+30 mins
            conflict = db_utils.check_time_room_conflict(converted_time, room)
            if not conflict:
                # send a message to slack when a tweet is scheduled to go out
                slack_message = "{} From: {}, id: {}".format(
                    tweet, screen_name, user_id)
                self.send_slack_message('#outgoing_tweets', slack_message)
                self.send_mention_tweet(screen_name)

                # This record lets us check that retweets are not for the same event
                db_utils.create_event(description=tweet,
                                      start=converted_time,
                                      location=room,
                                      creator=screen_name)
                tweet_utils.schedule_tweets(screen_name, tweet, tweet_id,
                                            converted_time)
                loggly.info("scheduled this tweet for retweet: {}".format(tweet))
            else:
                message = """Tweet received for an event the bot is already
                scheduled to retweet about. Sender: {}, room: {}, time: {},
                tweet: {} tweet_id: {}"""
                message = message.format(screen_name, room, converted_time,
                                         tweet, tweet_id)
                self.send_slack_message("#event_conflict", message)
                loggly.info(message)
        elif val_check == (0, 0):
            # tweet found but without a valid time or room extracted; ignore
            pass
        else:
            # tweet with relevant information but not exactly 1 time & 1 room
            message = """Tweet found that needs review: {} tweet_id: {}
            screen_name: {}, user_id: {}"""
            message = message.format(tweet, tweet_id, screen_name, user_id)
            self.send_slack_message("#need_review", message)
import os
import json

from sutime import SUTime

if __name__ == '__main__':
    test_case = u'I need a desk for tomorrow from 2pm to 3pm'
    # Use raw strings for Windows paths so backslashes are not treated as
    # escape sequences. The second assignment overrides the first.
    jar_files = r'C:\Users\Leandra\Anaconda2\lib\site-packages\sutime\jars'
    jar_files = r'C:\Users\Leandra\Documents\Fall2016\NLP\carpool-search\jars'
    print(jar_files)
    sutime = SUTime(jars=jar_files, mark_time_ranges=True)
    print(json.dumps(sutime.parse(test_case), sort_keys=True, indent=4))
class RetweetBot:
    def __init__(self):
        # Twitter API setup
        auth = tweepy.OAuthHandler(os.environ.get('CONSUMER_KEY'),
                                   os.environ.get('CONSUMER_SECRET'))
        auth.set_access_token(os.environ.get('ACCESS_TOKEN'),
                              os.environ.get('ACCESS_TOKEN_SECRET'))
        self.api = tweepy.API(auth)
        self.tweet_list = []
        self.relevance_scores = []

        # bad words
        response = requests.get(BAD_WORDS_URL)
        self.bad_words = response.text.split('\n')

        # stop words
        self.stopwords = list(stopwords.words('english'))

        # sutime
        jar_files = os.environ.get('JAR_FILES', '../python-sutime/jars')
        self.sutime = SUTime(jars=jar_files, mark_time_ranges=True)

        # nltk data append
        nltk.data.path.append(
            os.environ.get('NLTK_CORPUS', '/webapps/hackor/hackor/nltk_data'))

    def get_tweets(self, topic="#pycon", quantity=10,
                   result_type="recent,popular"):
        """Get all tweets."""
        tweet_list = self.api.search(q=topic, count=quantity, lang='en',
                                     result_type=result_type)
        print("Retrieved {} candidate tweets.".format(len(tweet_list)))
        self.tweet_list += tweet_list

    def clear_tweets(self):
        self.tweet_list = []
        self.relevance_scores = []

    def score(self, tweet):
        """Relevance score: the importance of the user tweeting.

        Features: tweeter followers, friends, ratio; number of hashtags in
        the tweet (smaller the better) (PageRank?). Remove tweets that have
        any bad words.
        """
        if not self.isSafe(tweet.text):
            return MAX_NEGATIVE
        if tweet.text.startswith('RT'):
            return MAX_NEGATIVE

        # influencer ratio
        influencer_ratio = 0
        if tweet.user.friends_count:
            influencer_ratio = tweet.user.followers_count / tweet.user.friends_count

        # number of hashtags
        hashtags = tweet.text.count('#')

        # hashtag word length
        hashtagcount = 0
        for word in tweet.text.split():
            if word.startswith('#'):
                hashtagcount += len(word)

        final_score = influencer_ratio * (hashtagcount / 140) * 1.0 / (
            1 + hashtags) * tweet.favorite_count
        final_score = 1.0  # NOTE: overrides the computed score above
        return final_score

    def compute_relevance_scores(self):
        """Compute relevance for all tweets."""
        for _id, tweet in enumerate(self.tweet_list):
            if self.score(tweet) > 0.0:
                self.relevance_scores.append((_id, self.score(tweet)))
        self.relevance_scores.sort(key=lambda tup: tup[1], reverse=True)

    def compose_relevant_slack_messages(self, count=1):
        messages = []
        if self.relevance_scores:
            for score in self.relevance_scores[0:count]:
                tweet_score = score[1]
                print(tweet_score)
                tweet = self.tweet_list[score[0]]
                message = ("RT <https://twitter.com/" + tweet.user.screen_name
                           + "|" + tweet.user.screen_name + "> " + tweet.text)
                message += ("\n <https://twitter.com/" + tweet.user.screen_name
                            + "/status/" + str(tweet.id) + "|Original Tweet>")
                messages.append(message)
        return messages

    def isSafe(self, tweet):
        result = True
        ret = tweet.replace('#', '')
        for word in self.bad_words:
            regex = r"\b(?=\w)" + re.escape(word) + r"\b(?!\w)"
            if re.search(regex, ret, re.IGNORECASE):
                result = False
                break
        return result

    def get_time_and_room(self, tweet):
        """Get time and room number from a tweet."""
        result = {'date': [], 'room': []}
        time_slots = self.sutime.parse(tweet)
        tweet_without_time = tweet
        for time_slot in time_slots:
            tweet_without_time = tweet_without_time.replace(
                time_slot.get('text'), '')
            result['date'].append(time_slot.get('value'))

        filter_known_words = [
            word.lower() for word in word_tokenize(tweet_without_time)
            if word.lower() not in (self.stopwords + nltk.corpus.words.words())
        ]

        # regular expression for a room number
        room_re = re.compile(r'([a-zA-Z](\d{3})[-+]?(\d{3})?)')
        for word in filter_known_words:
            if room_re.match(word):
                result['room'].append(room_re.match(word).group())
        return result
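# Quick illustration of the room pattern on its own (inputs are hypothetical):
room_re = re.compile(r'([a-zA-Z](\d{3})[-+]?(\d{3})?)')
for candidate in ['a123', 'b255-257', 'hallway', '42']:
    match = room_re.match(candidate)
    print(candidate, '->', match.group() if match else None)
# a123 -> a123, b255-257 -> b255-257, hallway -> None, 42 -> None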
class Streambot:
    """Stream Twitter and look for tweets that contain targeted words;
    when tweets are found, look for a datetime and room, and if present
    save the tweet to the OutgoingTweet model.

    Ex.
    bot = Streambot()
    # to run a stream looking for tweets about PyCon
    bot.run_stream(["PyCon"])
    """

    def __init__(self):
        self.api = self.setup_auth()
        self.stream_listener = StreamListener(self)
        self.tz = pytz.timezone('US/Pacific')
        jar_files = os.path.join(BASE_DIR, "python-sutime", "jars")
        self.sutime = SUTime(jars=jar_files, mark_time_ranges=True)

    def setup_auth(self):
        """Set up auth for the API and return a tweepy API object."""
        auth = tweepy.OAuthHandler(s.listener["CONSUMER_KEY"],
                                   s.listener["CONSUMER_SECRET"])
        auth.set_access_token(s.listener["ACCESS_TOKEN"],
                              s.listener["ACCESS_TOKEN_SECRET"])
        api = tweepy.API(auth)
        return api

    def run_stream(self, search_list=None):
        """Start the stream; when a matching tweet is found, on_status in
        StreamListener is called. search_list is a list of terms to look
        for in tweets.
        """
        if not search_list:
            raise ValueError("Need a list of search terms as arg to run_stream")
        stream = tweepy.Stream(auth=self.api.auth, listener=self.stream_listener)
        stream.filter(track=search_list)

    def convert_to_utc(self, talk_time):
        """Convert the datetime string we get from SUTime to UTC."""
        # get the correct local year, month, day
        local_date = datetime.now(self.tz)
        local_date_str = datetime.strftime(local_date, "%Y %m %d")
        year, month, day = local_date_str.split(" ")

        # get the SUTime-parsed talk time and extract hours, mins
        dt_obj = parse(talk_time)
        local_time_str = datetime.strftime(dt_obj, "%H %M")
        hours, mins = local_time_str.split(" ")

        # build the correct datetime obj, normalize & localize, switch to utc
        correct_dt = datetime(int(year), int(month), int(day),
                              int(hours), int(mins))
        tz_aware_local = self.tz.normalize(self.tz.localize(correct_dt))
        local_as_utc = tz_aware_local.astimezone(pytz.utc)
        return local_as_utc

    def schedule_tweets(self, screen_name, tweet, tweet_id, talk_time):
        """Take a tweet and datetime, schedule a number of reminder tweets
        at set intervals.
        """
        # check the config table to see if autosend is on
        config_obj = models.AppConfig.objects.latest("id")
        approved = 1 if config_obj.auto_send else 0

        tweet_url = "https://twitter.com/{name}/status/{tweet_id}"
        embeded_tweet = tweet_url.format(name=screen_name, tweet_id=tweet_id)

        # set the number of reminder tweets and the interval in minutes;
        # num_tweets = 2 & interval = 15 sends 2 tweets 30 & 15 mins before
        num_tweets = 2
        interval = 1
        for mins in range(interval, (num_tweets * interval + 1), interval):
            remind_time = talk_time - timedelta(minutes=mins)
            message = "Coming up in {} minutes! {}".format(mins, embeded_tweet)
            print("message should be saved!!!")
            # saving the tweet to the OutgoingTweet table triggers celery stuff
            tweet_obj = models.Tweets(tweet=message,
                                      approved=approved,
                                      scheduled_time=remind_time)
            tweet_obj.save()

    def retweet_logic(self, tweet, tweet_id, screen_name):
        """Use SUTime to try to parse a datetime out of a tweet; if successful,
        save the tweet to OutgoingTweet to be retweeted.
        """
        print(tweet, tweet_id)
        time_room = self.get_time_and_room(tweet)

        # check that both time and room were extracted, with only one value each
        val_check = [val for val in time_room.values() if len(val) == 1]
        if len(val_check) == 2:
            # way to mention a user after a valid tweet is received
            # time_stamp = datetime.datetime.utcnow()
            # mention = "@{} We saw your openspaces tweet!{}".format(screen_name, time_stamp)
            # self.api.update_status(status=mention)

            # need to make the time from SUTime match the time Django is using
            sutime_stuff = time_room["date"][0]
            print("sutime_stuff: {}".format(sutime_stuff))
            talk_time = self.convert_to_utc(time_room["date"][0])
            print("result from convert_to_utc: {}".format(talk_time))
            self.schedule_tweets(screen_name, tweet, tweet_id, talk_time)

    def get_time_and_room(self, tweet):
        """Get time and room number from a tweet.

        Written by Santi @ https://github.com/adavanisanti
        """
        result = {"date": [], "room": []}
        time_slots = self.sutime.parse(tweet)
        tweet_without_time = tweet
        for time_slot in time_slots:
            tweet_without_time = tweet_without_time.replace(
                time_slot.get("text"), "")
            result["date"].append(time_slot.get("value"))

        # filter_known_words = [word.lower() for word in word_tokenize(tweet_without_time)
        #                       if word.lower() not in (self.stopwords + nltk.corpus.words.words())]
        filter_known_words = [
            word.lower() for word in word_tokenize(tweet_without_time)
        ]

        # regular expression for room
        room_re = re.compile(r"([a-zA-Z](\d{3})[-+]?(\d{3})?)")
        for word in filter_known_words:
            if room_re.match(word):
                result["room"].append(room_re.match(word).group())
        return result
# Fragment: assumes line, sutime, and set_neg are defined earlier in the file.
# line = line.encode('utf-8')
line = line.encode('ascii', 'ignore').decode('ascii')  # strip non-ASCII chars
sent_tokenize_list = sent_tokenize(line)
# print(sent_tokenize_list)
print("-------------------------------")
res = []
dt = []
for sent in sent_tokenize_list:
    # split on sentence boundaries that sent_tokenize may have missed
    sent_list = re.split(
        r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)(\s|[A-Z].*)', sent)
    print(sent_list)
    for sent_new in sent_list:
        # dt = list(json.dumps(sutime.parse(sent), sort_keys=True))
        dt = sutime.parse(sent_new)
        # irrelevant if there is no timeframe info
        if len(dt) == 0:
            # res.append(('', z))
            continue
        else:
            word_neg_rel = ''
            word_pos_rel = ''
            tokens = nltk.word_tokenize(sent_new.lower())
            neg_flag = 0
            for token in tokens:
                if token.lower() in set_neg:
                    neg_flag = 1
                    break
class NLUModule:
    def __init__(self, classifier_path=None, ner_path=None,
                 sutime_jar_path=None):
        # Change the paths according to your system; raw strings keep the
        # backslashes in Windows paths literal.
        if classifier_path is None:
            classifier_path = r"C:\stanford_corenlp\stanford-ner-2018-02-27\stanford-ner-2018-02-27\classifiers\english.muc.7class.distsim.crf.ser.gz"
        if ner_path is None:
            ner_path = r"C:\stanford_corenlp\stanford-ner-2018-02-27\stanford-ner-2018-02-27\stanford-ner.jar"
        if sutime_jar_path is None:
            sutime_jar_path = r"C:\stanford_corenlp\stanford-corenlp-full-2018-02-27\stanford-corenlp-full-2018-02-27"

        self.stanford_classifier = classifier_path
        self.stanford_ner_path = ner_path
        self.sutime_path = sutime_jar_path

        # Creating tagger objects
        self.st = StanfordNERTagger(self.stanford_classifier,
                                    self.stanford_ner_path)
        self.su = SUTime(jars=self.sutime_path, mark_time_ranges=True,
                         include_range=True)

        self.weather_terms = ["weather", "climate", "precipitation", "sun",
                              "rain", "cloud", "snow", "hot", "humid", "cold",
                              "sunny", "windy", "cloudy", "rainy", "snowy",
                              "misty", "foggy", "colder", "hotter", "warmer",
                              "pleasant"]
        self.greet_terms = ["hello", "hey", "howdy", "hello", "hi", "yo", "yaw"]
        self.closure_terms = ["no", "nope", "thank you", "bye", "tata",
                              "thanks", "that will be all", "that's it",
                              "that'll be all"]
        self.day_terms = ["dawn", "dusk", "morning", "evening", "noon",
                          "afternoon", "night", "tonight", "midnight",
                          "midday"]  # , "hours"]
        self.date_terms = ["today", "tomorrow", "yesterday"]

    def DiscoverIntentAndEntities(self, text):
        text = text.strip()
        tokenized_text = text
        if ' ' in text:
            tokenized_text = word_tokenize(text)
        classified_text = self.st.tag(tokenized_text)
        time_tags = self.su.parse(text)
        queryDate = None
        queryTime = None
        # pos_tags = pos_tag(tokenized_text)
        detectionResults = {"intent": INTENT_TYPES.UNK,
                            "entities": {"LOCATION": "", "DATE": [],
                                         "TIME": [], "DURATION": [],
                                         "QUERIES": []}}
        returnIntentAndEnt = {"intent": INTENT_TYPES.UNK,
                              "entities": {"LOCATION": "", "DATE": "",
                                           "TIME": ""}}

        for word, tag in classified_text:
            if 'LOCATION' in tag:
                detectionResults["entities"]["LOCATION"] += word + " "
            # elif 'DATE' in tag or word in self.date_terms:
            #     detectionResults["entities"]["DATE"] = word + " "
            # elif 'DATE' in tag or word in self.day_terms:
            #     detectionResults["entities"]["HOUR"] = word + " "
            elif 'O' in tag and word in WEATHER_TERMS:
                detectionResults["entities"]["QUERIES"].append(word)
                detectionResults["intent"] = INTENT_TYPES.WTH_QU

        if len(time_tags) > 0:
            for tag in time_tags:
                typeKey = tag["type"]
                detectionResults["entities"][typeKey].append(tag["value"])

        if len(detectionResults["entities"]["DATE"]) > 0:
            for dateVal in detectionResults["entities"]["DATE"]:
                queryDate = dateVal
                if 'W' in dateVal:
                    queryDate = dateVal.replace('W', '')

        if len(detectionResults["entities"]["TIME"]) > 0:
            for timeVal in detectionResults["entities"]["TIME"]:
                timeTokens = timeVal.split('T')
                if queryDate is None:
                    queryDate = timeTokens[0]
                timePart = timeTokens[1]
                if timePart in TIME_ABRV.keys():
                    timePart = DAY_TERMS[TIME_ABRV[timePart]]
                    # Overwrite the query time with the generic time mapped
                    # for MO, AF etc.
                    queryTime = timePart
                # Do not overwrite the query time if a time was detected already
                if queryTime is None:
                    queryTime = timePart

        if detectionResults["intent"] == INTENT_TYPES.UNK:
            if text in GREET_TERMS:
                detectionResults["intent"] = INTENT_TYPES.GRT
            elif text in CLOSURE_TERMS:
                detectionResults["intent"] = INTENT_TYPES.CLS
            elif text in YES_TERMS:
                detectionResults["intent"] = INTENT_TYPES.ANS_YES
            elif text in NO_TERMS:
                detectionResults["intent"] = INTENT_TYPES.ANS_NO
            if len(detectionResults["entities"]["LOCATION"]) > 0 \
                    or len(detectionResults["entities"]["DATE"]) \
                    or len(detectionResults["entities"]["TIME"]) \
                    or len(detectionResults["entities"]["DURATION"]) > 0:
                detectionResults["intent"] = INTENT_TYPES.ANS_SLT

        returnIntentAndEnt["intent"] = detectionResults["intent"]
        returnIntentAndEnt["entities"]["LOCATION"] = \
            detectionResults["entities"]["LOCATION"]
        if queryDate is not None:
            returnIntentAndEnt["entities"]["DATE"] = queryDate
        if queryTime is not None:
            returnIntentAndEnt["entities"]["TIME"] = queryTime
        return returnIntentAndEnt

# nlu = NLUModule()
# print(nlu.DiscoverIntentAndEntities("Tomorrow afternoon"))
# print(nlu.DiscoverIntentAndEntities("How is the weather on Fifth March."))
# print(nlu.DiscoverIntentAndEntities("How is the weather in March."))
# print(nlu.DiscoverIntentAndEntities("What is it like on Tuesday."))
# print(nlu.DiscoverIntentAndEntities("How does it look like tomorrow?"))
# print(nlu.DiscoverIntentAndEntities("What is the weather like next week?"))
# print(nlu.DiscoverIntentAndEntities("What is it like tomorrow between 3 to 4pm ?"))
# print(nlu.DiscoverIntentAndEntities("What is it like tomorrow afternoon?"))
# print(nlu.DiscoverIntentAndEntities("What is it like tomorrow at noon?"))
# print(nlu.DiscoverIntentAndEntities("What is it like tomorrow evening?"))
# print(nlu.DiscoverIntentAndEntities("What is it like tomorrow morning?"))
# print(nlu.DiscoverIntentAndEntities("What is it like tomorrow night?"))
# print(nlu.DiscoverIntentAndEntities("What is it like tomorrow at midnight?"))
# print(nlu.DiscoverIntentAndEntities("What is it like tomorrow during noon?"))
# print(nlu.DiscoverIntentAndEntities("What is it like tomorrow around 11:00 in the morning?"))
# print(nlu.DiscoverIntentAndEntities("What is it like next Tuesday around 11:00 in the morning?"))
# print(nlu.DiscoverIntentAndEntities("What is it like next Tuesday between 10:00 and 11:00 in the night?"))

# from nltk import word_tokenize, pos_tag, ne_chunk
#
# import os
# import collections
#
# from nltk.stem.snowball import SnowballStemmer
# from nltk.chunk import conlltags2tree, tree2conlltags
# import string
#
#
# def features(tokens, index, history):
#     """
#     `tokens` = a POS-tagged sentence [(w1, t1), ...]
#     `index` = the index of the token we want to extract features for
#     `history` = the previous predicted IOB tags
#     """
#
#     # init the stemmer
#     stemmer = SnowballStemmer('english')
#
#     # Pad the sequence with placeholders
#     tokens = [('[START2]', '[START2]'), ('[START1]', '[START1]')] + list(tokens) + \
#              [('[END1]', '[END1]'), ('[END2]', '[END2]')]
#     history = ['[START2]', '[START1]'] + list(history)
#
#     # shift the index with 2, to accommodate the padding
#     index += 2
#
#     word, pos = tokens[index]
#     prevword, prevpos = tokens[index - 1]
#     prevprevword, prevprevpos = tokens[index - 2]
#     nextword, nextpos = tokens[index + 1]
#     nextnextword, nextnextpos = tokens[index + 2]
#     previob = history[index - 1]
#     contains_dash = '-' in word
#     contains_dot = '.' in word
#     allascii = all([True for c in word if c in string.ascii_lowercase])
#
#     allcaps = word == word.capitalize()
#     capitalized = word[0] in string.ascii_uppercase
#
#     prevallcaps = prevword == prevword.capitalize()
#     prevcapitalized = prevword[0] in string.ascii_uppercase
#
#     nextallcaps = prevword == prevword.capitalize()
#     nextcapitalized = prevword[0] in string.ascii_uppercase
#
#     return {
#         'word': word,
#         'lemma': stemmer.stem(word),
#         'pos': pos,
#         'all-ascii': allascii,
#
#         'next-word': nextword,
#         'next-lemma': stemmer.stem(nextword),
#         'next-pos': nextpos,
#
#         'next-next-word': nextnextword,
#         'nextnextpos': nextnextpos,
#
#         'prev-word': prevword,
#         'prev-lemma': stemmer.stem(prevword),
#         'prev-pos': prevpos,
#
#         'prev-prev-word': prevprevword,
#         'prev-prev-pos': prevprevpos,
#
#         'prev-iob': previob,
#
#         'contains-dash': contains_dash,
#         'contains-dot': contains_dot,
#
#         'all-caps': allcaps,
#         'capitalized': capitalized,
#
#         'prev-all-caps': prevallcaps,
#         'prev-capitalized': prevcapitalized,
#
#         'next-all-caps': nextallcaps,
#         'next-capitalized': nextcapitalized,
#     }
#
#
# def to_conll_iob(annotated_sentence):
#     """
#     `annotated_sentence` = list of triplets [(w1, t1, iob1), ...]
#     Transform a pseudo-IOB notation: O, PERSON, PERSON, O, O, LOCATION, O
#     to proper IOB notation: O, B-PERSON, I-PERSON, O, O, B-LOCATION, O
#     """
#     proper_iob_tokens = []
#     for idx, annotated_token in enumerate(annotated_sentence):
#         tag, word, ner = annotated_token
#
#         if ner != 'O':
#             if idx == 0:
#                 ner = "B-" + ner
#             elif annotated_sentence[idx - 1][2] == ner:
#                 ner = "I-" + ner
#             else:
#                 ner = "B-" + ner
#         proper_iob_tokens.append((tag, word, ner))
#     return proper_iob_tokens
#
#
# def read_gmb(corpus_root):
#     for root, dirs, files in os.walk(corpus_root):
#         for filename in files:
#             if filename.endswith(".tags"):
#                 with open(os.path.join(root, filename), 'rb') as file_handle:
#                     file_content = file_handle.read().decode('utf-8').strip()
#                     annotated_sentences = file_content.split('\n\n')
#                     for annotated_sentence in annotated_sentences:
#                         annotated_tokens = [seq for seq in annotated_sentence.split('\n') if seq]
#
#                         standard_form_tokens = []
#
#                         for idx, annotated_token in enumerate(annotated_tokens):
#                             annotations = annotated_token.split('\t')
#                             word, tag, ner = annotations[0], annotations[1], annotations[3]
#
#                             if ner != 'O':
#                                 ner = ner.split('-')[0]
#
#                             if tag in ('LQU', 'RQU'):  # Make it NLTK compatible
#                                 tag = "``"
#
#                             standard_form_tokens.append((word, tag, ner))
#
#                         conll_tokens = to_conll_iob(standard_form_tokens)
#
#                         # Make it NLTK Classifier compatible:
#                         # [(w1, t1, iob1), ...] to [((w1, t1), iob1), ...]
#                         # because the classifier expects a tuple as input,
#                         # first item input, second the class
#                         yield [((w, t), iob) for w, t, iob in conll_tokens]
#
#
# import pickle
# from collections import Iterable
# from nltk.tag import ClassifierBasedTagger
# from nltk.chunk import ChunkParserI
#
#
# class NamedEntityChunker(ChunkParserI):
#     def __init__(self, train_sents, **kwargs):
#         assert isinstance(train_sents, Iterable)
#
#         self.feature_detector = features
#         self.tagger = ClassifierBasedTagger(
#             train=train_sents,
#             feature_detector=features,
#             **kwargs)
#
#     def parse(self, tagged_sent):
#         chunks = self.tagger.tag(tagged_sent)
#
#         # Transform the result from [((w1, t1), iob1), ...]
#         # to the preferred list of triplets format [(w1, t1, iob1), ...]
#         iob_triplets = [(w, t, c) for ((w, t), c) in chunks]
#
#         # Transform the list of triplets to nltk.Tree format
#         return conlltags2tree(iob_triplets)
#
#
# corpus_root = "gmb-2.2.0/gmb-2.2.0"  # Set the proper path to the unzipped corpus
# reader = read_gmb(corpus_root)
# training_samples = list(reader)
# chunker = NamedEntityChunker(training_samples[:2000])
# print(chunker.parse(pos_tag(word_tokenize("What's the weather like in Pittsburgh this Monday."))))
class Streambot:
    """Stream Twitter and look for tweets that contain targeted words;
    when tweets are found, look for a datetime and room, and if present
    save the tweet to the OutgoingTweet model.

    Ex.
    bot = Streambot()
    # to run a stream looking for tweets about PyCon
    bot.run_stream(["PyCon"])
    """

    def __init__(self):
        self.api = self.setup_auth()
        self.stream_listener = StreamListener(self)
        jar_files = os.path.join(BASE_DIR, "python-sutime", "jars")
        self.sutime = SUTime(jars=jar_files, mark_time_ranges=True)
        self.slacker = Slacker(s.SLACK_TOKEN)

    def setup_auth(self):
        """Set up auth for the API and return a tweepy API object."""
        auth = tweepy.OAuthHandler(s.listener["CONSUMER_KEY"],
                                   s.listener["CONSUMER_SECRET"])
        auth.set_access_token(s.listener["ACCESS_TOKEN"],
                              s.listener["ACCESS_TOKEN_SECRET"])
        api = tweepy.API(auth)
        return api

    def run_stream(self, search_list=None):
        """Start the stream; when a matching tweet is found the on_status
        method is called. search_list is a list of terms to look for in tweets.
        """
        if search_list is None:
            raise ValueError("Need a list of search terms as arg to run_stream")
        stream = tweepy.Stream(auth=self.api.auth, listener=self.stream_listener)
        stream.filter(track=search_list)

    def send_mention_tweet(self, screen_name):
        """Mention a user in a tweet from the bot letting them know that their
        tweet has been received and that we will send out reminders about
        their event.
        """
        time = datetime.datetime.now()
        mention = "@{} We saw your openspaces tweet! {}".format(screen_name, time)
        self.api.update_status(status=mention)

    def parse_time_room(self, tweet):
        """Get time and room number from a tweet using SUTime and tweet_utils."""
        extracted_time = self.sutime.parse(tweet)
        time_and_room = tweet_utils.get_time_and_room(tweet, extracted_time)
        return time_and_room

    def retweet_logic(self, tweet, tweet_id, screen_name, user_id):
        """Use SUTime to try to parse a datetime out of a tweet; if successful,
        save the tweet to OutgoingTweet to be retweeted.
        """
        # use SUTime to parse a datetime out of the tweet
        time_room = self.parse_time_room(tweet)

        # make sure both time and room were extracted, and only one value each
        val_check = [val for val in time_room.values() if len(val) == 1]
        if len(val_check) == 2:
            talk_room = time_room["room"][0]
            parsed_time = time_room["date"][0]
            talk_time = time_utils.convert_to_utc(parsed_time)

            # check for a time and room conflict, only one set of retweets per event
            conflict = db_utils.check_time_room_conflict(talk_time, talk_room)
            if not conflict:
                # send a message to slack when a tweet is scheduled to go out
                slack_message = "{} From: {}, id: {}".format(
                    tweet, screen_name, user_id)
                self.slacker.chat.post_message('#outgoing_tweets', slack_message)
                self.send_mention_tweet(screen_name)

                # This record lets us check that retweets are not for the same event
                db_utils.create_event(description=tweet,
                                      start=talk_time,
                                      location=talk_room,
                                      creator=screen_name)
                # schedules reminder tweets to be sent out before the event
                tweet_utils.schedule_tweets(screen_name, tweet, tweet_id,
                                            talk_time)
jar_files = os.path.join(os.path.dirname(__file__), 'jars')
sutime = SUTime(jars=jar_files, mark_time_ranges=True, include_range=True)
lemmatizer = nltk.WordNetLemmatizer()

# ----------------------------------------------------------------------
# LOAD THE SENTENCES
filepath = 'kolbuszowa.txt'
list_sentences = []
with open(filepath, encoding="utf8") as file:
    for line in file:
        list_sentences.append([line[:line.rfind(".") + 1]])

# PREPROCESSING START
for i in range(len(list_sentences)):
    sentence = list_sentences[i][0]

    # PREPROCESSING: replace the first temporal expression with its
    # normalized SUTime value, then substitute known synonyms.
    jsn = json.dumps(sutime.parse(sentence), sort_keys=True, indent=4)
    d = json.loads(jsn)
    if len(d) > 0:
        sentence = sentence.replace(d[0]['text'], d[0]['value'])
    for key in synonyms.keys():
        sentence = sentence.replace(key, synonyms[key])

    # CREATE DEPENDENCY TREE
    result = list(sdp.raw_parse(sentence))
    tree = get_tree(result[0], 4)
    dep_tree_dot_repr = tree.to_dot()
    # source = Source(dep_tree_dot_repr, filename="dep_tree", format="png")
    # source.view()
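# The dumps/loads round trip above is redundant (sutime.parse already returns
# a list of dicts), and str.replace can touch unrelated text. A sketch that
# rewrites each expression in place using the start/end offsets SUTime
# reports (normalize_temporal is an illustrative helper):
def normalize_temporal(sentence, sutime):
    """Replace every temporal expression with its normalized SUTime value."""
    annotations = sutime.parse(sentence)
    # Rewrite right-to-left so earlier offsets stay valid.
    for ann in sorted(annotations, key=lambda a: a['start'], reverse=True):
        if 'value' in ann:
            sentence = (sentence[:ann['start']] + str(ann['value'])
                        + sentence[ann['end']:])
    return sentence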
class Streambot:
    """Stream Twitter and look for tweets that contain targeted words;
    when tweets are found, look for a datetime and room, and if present
    save the tweet to the OutgoingTweet model.

    Ex.
    bot = Streambot()
    # to run a stream looking for tweets about PyCon
    bot.run_stream(["PyCon"])
    """

    def __init__(self):
        # needs an outgoing config obj to check against
        db_utils.setup_outgoing_config()
        self.api = self.setup_auth()
        self.stream_listener = StreamListener(self)
        jar_files = os.path.join(BASE_DIR, "python-sutime", "jars")
        self.sutime = SUTime(jars=jar_files, mark_time_ranges=True)
        self.slacker = Slacker(s.SLACK_TOKEN)

    def setup_auth(self):
        """Set up auth for the API and return a tweepy API object."""
        auth = tweepy.OAuthHandler(s.test_bot["CONSUMER_KEY"],
                                   s.test_bot["CONSUMER_SECRET"])
        auth.set_access_token(s.test_bot["ACCESS_TOKEN"],
                              s.test_bot["ACCESS_TOKEN_SECRET"])
        api = tweepy.API(auth)
        return api

    def run_stream(self, search_list=None):
        """Start the stream; when a matching tweet is found the on_status
        method is called. search_list is a list of terms to look for in tweets.
        """
        if search_list is None:
            raise ValueError("Need a list of search terms as arg to run_stream")
        stream = tweepy.Stream(auth=self.api.auth, listener=self.stream_listener)
        stream.filter(track=search_list)

    def send_mention_tweet(self, screen_name, room, time):
        """Mention a user in a tweet from the bot letting them know that their
        tweet has been received and that we will send out reminders about
        their event.
        """
        mention = "@{} saw your openspaces tweet for: room {} at {}. Times should be relative to US/Pacific"
        mention = mention.format(screen_name, room, time)
        self.api.update_status(status=mention)

    def value_check(self, time_room_obj):
        """Return a tuple with the counts of values extracted from a tweet in
        the parse_time_room method. This tuple is used to decide how the bot
        will respond to a tweet.
        """
        num_room_values = len(time_room_obj["room"])
        num_time_values = len(time_room_obj["date"])
        return (num_room_values, num_time_values)

    def parse_time_room(self, tweet):
        """Get time and room number from a tweet using SUTime and tweet_utils."""
        extracted_time = self.sutime.parse(tweet)
        time_and_room = tweet_utils.get_time_and_room(tweet, extracted_time)
        return time_and_room

    def retweet_logic(self, tweet, tweet_id, screen_name, user_id):
        """Use SUTime to try to parse a datetime out of a tweet; if successful,
        save the tweet to OutgoingTweet to be retweeted.
        """
        # use SUTime to parse a datetime out of the tweet
        time_room = self.parse_time_room(tweet)

        # make sure both time and room were extracted, and only one value each
        val_check = self.value_check(time_room)
        if val_check == (1, 1):
            room = time_room["room"][0]
            date_mention = tweet_utils.check_date_mention(tweet)
            converted_time = time_utils.convert_to_utc(time_room["date"][0],
                                                       date_mention)

            # check for a time and room conflict: only 1 set of retweets per
            # event; the default range a room is reserved for is -15/+30 mins
            conflict = db_utils.check_time_room_conflict(converted_time, room)
            if not conflict:
                event_obj = db_utils.create_event(description=tweet,
                                                  start=converted_time,
                                                  location=room,
                                                  creator=screen_name)
                tweet_utils.schedule_tweets(screen_name, tweet, tweet_id,
                                            converted_time, event_obj)
                # slack_msg = "{} From: {}, id: {}".format(tweet, screen_name, user_id)
                # self.send_slack_message('#outgoing_tweets', slack_message)
                send_slack_message(user_id=user_id,
                                   tweet_id=tweet_id,
                                   screen_name=screen_name,
                                   tweet_created=True,
                                   tweet=tweet,
                                   slack_msg=tweet)
                self.send_mention_tweet(screen_name, room, converted_time)
            else:
                message = f"Tweet found for an already scheduled event: {tweet}"
                send_slack_message(user_id=user_id,
                                   tweet_id=tweet_id,
                                   screen_name=screen_name,
                                   tweet_created=False,
                                   tweet=tweet,
                                   slack_msg=message,
                                   channel="conflict")
        elif val_check == (0, 0):
            # tweet found but without a valid time or room extracted; ignore
            pass
        else:
            # tweet with relevant information but not exactly 1 time & 1 room
            slack_msg = """Tweet found that needs review: {} tweet_id: {}
            screen_name: {}, user_id: {}"""
            slack_msg = slack_msg.format(tweet, tweet_id, screen_name, user_id)
            # self.send_slack_message("#need_review", message)
            send_slack_message(user_id=user_id,
                               tweet_id=tweet_id,
                               screen_name=screen_name,
                               tweet_created=False,
                               tweet=tweet,
                               slack_msg=slack_msg)

    def loadtest_logic(self, tweet, tweet_id, screen_name, user_id):
        """Logic similar to what is used in the real bot so that we can load
        test how much volume it can handle before Twitter kicks it off.
        """
        # use SUTime to parse a datetime out of the tweet
        time_room = self.parse_time_room(tweet)

        # fake time in the future that imitates an event's start time
        local_tz = pytz.timezone('US/Eastern')
        sample_time = datetime.datetime.now(local_tz) + datetime.timedelta(minutes=10)
        sample_time = sample_time.strftime("%Y-%m-%d %H:%M:%S")
        event_time = time_utils.convert_to_utc(sample_time)
        room = random.randint(0, 3000)

        # check for a time and room conflict, only 1 set of retweets per event
        conflict = db_utils.check_time_room_conflict(event_time, room)
        if not conflict:
            # This record lets us check that retweets are not for the same event
            event_obj = db_utils.create_event(description=tweet,
                                              start=event_time,
                                              location=room,
                                              creator=screen_name)
            tweet_utils.loadtest_schedule_tweets(screen_name=screen_name,
                                                 tweet=tweet,
                                                 tweet_id=tweet_id,
                                                 event_time=event_time,
                                                 event_obj=event_obj)
            print("tweet scheduled for retweet: {}".format(tweet))
            slack_msg = "{} From: {}, id: {}".format(tweet, screen_name, user_id)
            # self.send_slack_message('#outgoing_tweets', slack_message)
            send_slack_message(user_id=user_id,
                               tweet_id=tweet_id,
                               screen_name=screen_name,
                               tweet_created=True,
                               tweet=tweet,
                               slack_msg=slack_msg,
                               event_obj=event_obj)
        else:
            print("conflict when scheduling the tweet")
import os
import re
import sys
import codecs

from sutime import SUTime

# Texas trinomials look like 41WM1234
asciiFile = str(sys.argv[1])
trinomials = re.compile("41[a-zA-Z]{2}[0-9]{1,4}")
lineNum = 0

# SUTime.parse is an instance method, so build a parser first;
# jar_files is illustrative and should point at the python-sutime jars.
jar_files = os.path.join(os.path.dirname(__file__), 'jars')
sutime = SUTime(jars=jar_files)

with codecs.open(asciiFile, 'r', encoding='utf-8', errors='ignore') as f:
    for line in f:
        # result = trinomials.search(line)
        result = trinomials.findall(line)
        for item in result:  # findall returns a (possibly empty) list
            print("Site " + str(item) + ", line " + str(lineNum))
            print(sutime.parse(line, reference_date=''))
        lineNum += 1
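# python-sutime resolves relative expressions against reference_date when one
# is supplied; a small example (output shape depends on the SUTime version):
sutime = SUTime(jars=jar_files, mark_time_ranges=True)
for annotation in sutime.parse('the dig resumes next Friday',
                               reference_date='2016-03-01'):
    print(annotation['type'], annotation.get('value'))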
class Streambot:
    """Stream Twitter and look for tweets that contain targeted words;
    when tweets are found, look for a datetime and room, and if present
    save the tweet to the OutgoingTweet model.

    Ex.
    bot = Streambot()
    # to run a stream looking for tweets about PyCon
    bot.run_stream(["PyCon"])
    """

    def __init__(self):
        self.api = self.setup_auth()
        self.stream_listener = StreamListener(self)
        jar_files = os.path.join(BASE_DIR, "python-sutime", "jars")
        self.sutime = SUTime(jars=jar_files, mark_time_ranges=True)
        self.slacker = Slacker(s.SLACK_TOKEN)

    def setup_auth(self):
        """Set up auth for the API and return a tweepy API object."""
        auth = tweepy.OAuthHandler(s.sender["CONSUMER_KEY"],
                                   s.sender["CONSUMER_SECRET"])
        auth.set_access_token(s.sender["ACCESS_TOKEN"],
                              s.sender["ACCESS_TOKEN_SECRET"])
        api = tweepy.API(auth)
        return api

    def run_stream(self, search_list=None):
        """Start the stream; when a matching tweet is found the on_status
        method is called. search_list is a list of terms to look for in tweets.
        """
        if search_list is None:
            raise ValueError("Need a list of search terms as arg to run_stream")
        stream = tweepy.Stream(auth=self.api.auth, listener=self.stream_listener)
        stream.filter(track=search_list)

    def send_mention_tweet(self, screen_name, room, time):
        """Mention a user in a tweet from the bot letting them know that their
        tweet has been received and that we will send out reminders about
        their event.
        """
        mention = "@{} saw your openspaces tweet for: room {} at {}. Times should be relative to US/Pacific"
        mention = mention.format(screen_name, room, time)
        self.api.update_status(status=mention)

    def parse_time_room(self, tweet):
        """Get time and room number from a tweet using SUTime and tweet_utils."""
        extracted_time = self.sutime.parse(tweet)
        time_and_room = tweet_utils.get_time_and_room(tweet, extracted_time)
        return time_and_room

    def loadtest_logic(self, tweet, tweet_id, screen_name, user_id):
        """Logic similar to what is used in the real bot so that we can load
        test how much volume it can handle before Twitter kicks it off.
        """
        # use SUTime to parse a datetime out of the tweet
        time_room = self.parse_time_room(tweet)

        # fake time in the future that imitates an event's start time
        local_tz = pytz.timezone('US/Pacific')
        sample_time = datetime.datetime.now(local_tz) + datetime.timedelta(minutes=10)
        sample_time = sample_time.strftime("%Y-%m-%d %H:%M:%S")
        converted_time = time_utils.convert_to_utc(sample_time)
        room = "r123"

        # check for a time and room conflict, only 1 set of retweets per event
        # NOTE: conflict is computed but not acted on in this load-test path
        conflict = db_utils.check_time_room_conflict(converted_time, room)

        # send a message to slack when a tweet is scheduled to go out
        slack_message = "{} From: {}, id: {}".format(tweet, screen_name, user_id)
        self.slacker.chat.post_message('#loadtest_tweets', slack_message)

        # This record lets us check that retweets are not for the same event
        db_utils.create_event(description=tweet,
                              start=converted_time,
                              location=room,
                              creator=screen_name)
        tweet_utils.loadtest_schedule_tweets(screen_name, tweet, tweet_id,
                                             converted_time)
        print("tweet scheduled for retweet: {}".format(tweet))
class Streambot:
    """Stream Twitter and look for tweets that contain targeted words;
    when tweets are found, look for a datetime and room, and if present
    save the tweet to the OutgoingTweet model.

    Ex.
    bot = Streambot()
    # to run a stream looking for tweets about PyCon
    bot.run_stream(["PyCon"])
    """

    def __init__(self):
        self.api = self.setup_auth()
        self.stream_listener = StreamListener(self)
        jar_files = os.path.join(BASE_DIR, "python-sutime", "jars")
        self.sutime = SUTime(jars=jar_files, mark_time_ranges=True)

    def setup_auth(self):
        """Set up auth for the API and return a tweepy API object."""
        auth = tweepy.OAuthHandler(s.listener["CONSUMER_KEY"],
                                   s.listener["CONSUMER_SECRET"])
        auth.set_access_token(s.listener["ACCESS_TOKEN"],
                              s.listener["ACCESS_TOKEN_SECRET"])
        api = tweepy.API(auth)
        return api

    def run_stream(self, search_list=None):
        """Start the stream; when a matching tweet is found the on_status
        method is called. search_list is a list of terms to look for in tweets.
        """
        if search_list is None:
            raise ValueError("Need a list of search terms as arg to run_stream")
        stream = tweepy.Stream(auth=self.api.auth, listener=self.stream_listener)
        stream.filter(track=search_list)

    def send_mention_tweet(self, screen_name, room, time):
        """Mention a user in a tweet from the bot letting them know that their
        tweet has been received and that we will send out reminders about
        their event.
        """
        mention = "@{} saw your openspaces tweet for: room {} at {}. Times should be relative to US/Pacific"
        mention = mention.format(screen_name, room, time)
        self.api.update_status(status=mention)

    def parse_time_room(self, tweet):
        """Get time and room number from a tweet using SUTime and tweet_utils."""
        extracted_time = self.sutime.parse(tweet)
        time_and_room = tweet_utils.get_time_and_room(tweet, extracted_time)
        return time_and_room

    def retweet_logic(self, tweet, tweet_id, screen_name):
        """Use SUTime to try to parse a datetime out of a tweet; if successful,
        save the tweet to OutgoingTweet to be retweeted.
        """
        # use SUTime to parse a datetime out of the tweet
        time_room = self.parse_time_room(tweet)

        # make sure both time and room were extracted, and only one value each
        val_check = [val for val in time_room.values() if len(val) == 1]
        if len(val_check) == 2:
            room = time_room["room"][0]
            converted_time = time_utils.convert_to_utc(time_room["date"][0])

            # check for a time and room conflict, only 1 set of retweets per event
            conflict = db_utils.check_time_room_conflict(converted_time, room)
            if not conflict:
                self.send_mention_tweet(screen_name, room, converted_time)

                # This record lets us check that retweets are not for the same event
                db_utils.create_event(description=tweet,
                                      start=converted_time,
                                      location=room,
                                      creator=screen_name)
                tweet_utils.schedule_tweets(screen_name, tweet, tweet_id,
                                            converted_time)
                loggly.info("scheduled this tweet for retweet: {}".format(tweet))
            else:
                message = """Tweet received for an event the bot is already
                scheduled to retweet about. Sender: {}, room: {}, time: {},
                tweet: {}"""
                message = message.format(screen_name, room, converted_time, tweet)
                loggly.info(message)
        else:
            # tweet found but without a valid time or room extracted; ignore
            pass
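# The len(val_check) == 2 test above cannot tell "nothing extracted" apart
# from "too many values", so everything that is not exactly one time and one
# room is silently dropped. Later revisions in this collection branch three
# ways on an explicit count tuple (see value_check in the versions above);
# a sketch of that pattern:
def value_check(time_room_obj):
    """(1, 1) is actionable, (0, 0) is ignorable, anything else needs review."""
    return (len(time_room_obj["room"]), len(time_room_obj["date"]))

counts = value_check({"room": ["r123"],
                      "date": ["2018-05-11T15:00", "2018-05-12T15:00"]})
if counts == (1, 1):
    print("schedule retweet")
elif counts == (0, 0):
    print("ignore")
else:
    print("needs review")  # this branch: one room but two times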