Example #1
    def getTimelineTweets(self, date, max_tweets, topic):
        '''

        :param date: python datetime object
        :param max_tweets: Maximum number of tweets to return
        :param topic: Topic to search for
        :return: List of tuples, tuple format (date,tweet)
        '''
        from_date = date - datetime.timedelta(days=1)
        from_date = from_date.strftime("%Y-%m-%d")
        to_date = date + datetime.timedelta(days=1)
        to_date = to_date.strftime("%Y-%m-%d")
        curr_tweets = 0
        tweets = []
        for tweet in tweepy.Cursor(self.api.search,
                                   q=topic + ' -filter:retweets',
                                   since=from_date,
                                   until=to_date).items():
            tweet_date = tweet._json['created_at']
            tweet_text = tweet._json['text']
            # Keep only tweets containing a time expression: tag() wraps any
            # expression it finds in <TIMEX2> markup, lengthening the text.
            if len(tag(tweet_text)) > len(tweet_text):
                tweets.append((tweet_date, tweet_text))
                curr_tweets += 1
            if curr_tweets >= max_tweets:
                break
        return tweets
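A minimal sketch of the filter used above, assuming nltk_contrib's regex-based timex.py is importable as `timex` (the same module the other examples reference): tag() wraps every time expression it detects in <TIMEX2> markup, so the tagged string is longer than the input exactly when an expression was found.

from timex import tag  # nltk_contrib/timex.py, assumed to be on the path

text = "The protest started two days ago and ended yesterday."
tagged = tag(text)
print(tagged)                   # e.g. "... <TIMEX2>two days ago</TIMEX2> ... <TIMEX2>yesterday</TIMEX2>."
print(len(tagged) > len(text))  # True whenever at least one expression was tagged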
Example #2
def process_doc(single_doc, q_type, doc_num):
    # in order of type constants (0 : who, 1: where, 2: when)

    # BASELINE ONLY!
    # for WHEN questions nltk's NER doesn't work, so we use timex.py to tag time expressions
    # https://github.com/nltk/nltk_contrib/blob/master/nltk_contrib/timex.py
    if q_type == WHEN_TYPE:
        sentences = nltk.tokenize.sent_tokenize(single_doc)
        surviving_sentences = []
        for sentence in sentences:
            sentence_after_tagging = timex.tag(sentence)
            if sentence_after_tagging.find('<TIMEX2>') != -1:
                surviving_sentences.append((doc_num, sentence))
        return surviving_sentences

    # http://nbviewer.jupyter.org/github/gmonce/nltk_parsing/blob/master/1.%20NLTK%20Syntax%20Trees.ipynb
    else:
        sentences = nltk.tokenize.sent_tokenize(single_doc)
        surviving_sentences = []
        for sentence in sentences:
            words = nltk.word_tokenize(sentence)
            pos_tag = nltk.pos_tag(words)
            # this is in nltk tree
            # reference : http://www.nltk.org/howto/tree.html
            # http://nbviewer.jupyter.org/github/gmonce/nltk_parsing/blob/master/1.%20NLTK%20Syntax%20Trees.ipynb
            ner_tree = nltk.ne_chunk(pos_tag)
            # bool for whether this sentence has an NER tag matching the question type
            contains_tag = False
            for subtree in ner_tree.subtrees():
                if subtree.label() in NER_TAG[q_type]:
                    contains_tag = True
            if contains_tag:
                surviving_sentences.append((doc_num, sentence))
        #print surviving_sentences
        return surviving_sentences
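process_doc relies on module-level constants (WHEN_TYPE, NER_TAG) that are not shown in the snippet. A hypothetical setup and call might look like the following; the exact tag sets are an assumption, and the required nltk models (punkt, POS tagger, ne_chunker) must already be downloaded.

# Hypothetical constants assumed by process_doc; not part of the original snippet.
WHO_TYPE, WHERE_TYPE, WHEN_TYPE = 0, 1, 2
NER_TAG = {
    WHO_TYPE: ('PERSON', 'ORGANIZATION'),
    WHERE_TYPE: ('GPE', 'LOCATION'),
}

doc = "Barack Obama visited Berlin. The summit lasted three days."
# Keep only sentences mentioning a person or organization for a "who" question.
print(process_doc(doc, WHO_TYPE, 0))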
Example #3
def answer_processing(s_tuple, q_type, q_keywords):
	#print "DOING ANSWER_PROCESSING"
	sentences = s_tuple
	# http://nbviewer.jupyter.org/github/gmonce/nltk_parsing/blob/master/1.%20NLTK%20Syntax%20Trees.ipynb
	# in string
	answers = []
	# NEED TO ACCOUNT FOR CASES IN WHICH THERE ARE LESS THAN 5 ANSWERS
	num_answers_needed = 5 - len(sentences)
	if(num_answers_needed > 0):
		for i in range(0,num_answers_needed):
			sentences.append(('100','nil'))
	for i in range(0, len(sentences)):
		doc_num = sentences[i][0]
		sentence = sentences[i][1]
		if q_type == WHEN_TYPE:
			sentence_after_tagging = timex.tag(sentence)
			when_answers = re.findall('<TIMEX2>(.*?)</TIMEX2>', sentence_after_tagging)
			# if no time expression is found, fall back to 'nil'
			when_answer = when_answers[0] if len(when_answers) != 0 else 'nil'
			answers.append((doc_num, when_answer))

		else:		
			words = nltk.word_tokenize(sentence)
			pos_tag = nltk.pos_tag(words)
			ner_tree = nltk.ne_chunk(pos_tag)
			#print ner_tree
			# the list of tuples((word, pos),ner) to be considered for this sentence
			matching_tuples = []
			# print q_keywords
			global subtree
			tmp = []
			for subtree in ner_tree.subtrees():
				if subtree.label() in NER_TAG[q_type] and subtree.pos()[0][0][1]=='NNP':					
					word = ' '.join(map(lambda x : x[0][0], subtree.pos()))
					print word
					print q_keywords
					iskwin = map(lambda x : x in word, q_keywords)
					if not any(iskwin):						
						# print "SUBTREE!", subtree
						# matching_tuples = subtree.pos()
						answer = ' '.join(map(lambda x : x[0][0], subtree.pos()))
						if answer not in map(lambda x : x[1],answers):
							tmp.append((doc_num,answer))
						

			k_sorted = sort_keywords(sentence, tmp, q_keywords)
			answers+=k_sorted;
			print answers
			print "SENTENCE : ", sentence, "ANSWER : ", tmp
			# t : ((word, pos), ner)
			# answer = ''
			# for t in matching_tuples:
			# 	#print t
			# 	if t[0][0] not in q_keywords:
			# 		answer += t[0][0] + ' '
			# # remove any possible trailing whitespaces
			# answer = answer.rstrip()
			# answers.append((doc_num,answer))
	print answers
	return answers
Example #4
def timex_parse(content, base_time=gmt()):
    """Timex tagger using the timex module.

    Ripped from nltk_contrib using regex.
    """
    tagged_text = tag(content)
    injected_base_text = ground(tagged_text, base_time)
    return injected_base_text
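As example #9 below shows, ground() annotates the <TIMEX2> tags with resolved values, so downstream code usually pulls the expressions back out with a regex or an HTML parser. A hedged sketch of calling the helper above:

import re

grounded = timex_parse("The ceasefire was announced yesterday and takes effect next week.")
# The grounded tags may carry a val="..." attribute, hence the permissive pattern.
expressions = re.findall(r'<TIMEX2[^>]*>(.*?)</TIMEX2>', grounded)
print(expressions)  # e.g. ['yesterday', 'next week']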
Example #5
def get_date(result_entry, process_date):
    """
    Function to extract date from a story. First checks for a date from the RSS
    feed itself. Then tries to pull a date from the first two sentences of a
    story. Finally turns to the date that the story was added to the database.
    For the dates pulled from the story, the function checks whether the
    difference is greater than one day from the date that the pipeline is
    parsing.

    Parameters
    ----------

    result_entry: Dictionary.
                    Record of a single result from the web scraper.

    process_date: datetime object.
                    Datetime object indicating which date the pipeline is
                    processing. Standard is date_running - 1 day.


    Returns
    -------

    date : String.
            Date string in the form YYMMDD.

    """
    date_obj = ''
    if result_entry['date']:
        try:
            date_obj = parser.parse(result_entry['date'])
        except TypeError:
            date_obj = ''
    else:
        date_obj = ''

    if not date_obj:
        tagged = timex.tag(result_entry['content'][:2])
        dates = re.findall(r'<TIMEX2>(.*?)</TIMEX2>', tagged)
        if dates:
            try:
                date_obj = parser.parse(dates[0])
                diff_check = _check_date(date_obj, process_date)
                if diff_check:
                    date_obj = ''
            except TypeError:
                date_obj = ''
        else:
            date_obj = ''

    if not date_obj:
        date_obj = result_entry['date_added']

    date = '{}{:02d}{:02d}'.format(
        str(date_obj.year)[2:], date_obj.month, date_obj.day)

    return date
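A hypothetical call, assuming `parser` is dateutil.parser and `_check_date` is the helper referenced above (returning True when the extracted date is more than a day away from process_date). The dictionary keys mirror the ones the function reads:

import datetime

entry = {
    'date': 'Tue, 03 Jun 2014 09:00:00 GMT',  # RSS date, parsed directly
    'content': 'The attack happened yesterday. Officials responded today.',
    'date_added': datetime.datetime(2014, 6, 4),
}
print(get_date(entry, datetime.datetime(2014, 6, 3)))  # '140603'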
Example #7
def answer_processing(s_tuple, q_type, q_keywords):
    sentences = s_tuple
    # http://nbviewer.jupyter.org/github/gmonce/nltk_parsing/blob/master/1.%20NLTK%20Syntax%20Trees.ipynb
    # in string
    answers = []
    # NEED TO ACCOUNT FOR CASES IN WHICH THERE ARE LESS THAN 5 ANSWERS
    num_answers_needed = 5 - len(sentences)
    if (num_answers_needed > 0):
        for i in range(0, num_answers_needed):
            sentences.append(('100', 'nil'))
    for i in range(0, 5):
        doc_num = sentences[i][0]
        sentence = sentences[i][1]
        # nltk NER doesn't work with when_type, let's use timex.py
        if q_type == WHEN_TYPE:
            sentence_after_tagging = timex.tag(sentence)
            when_answers = re.findall('<TIMEX2>(.*?)</TIMEX2>',
                                      sentence_after_tagging)
            # in case answer comes out as empty, output an empty string
            when_answer = when_answers[0] if len(when_answers) != 0 else ''
            answers.append((doc_num, when_answer))
        else:
            words = nltk.word_tokenize(sentence)
            pos_tag = nltk.pos_tag(words)
            ner_tree = nltk.ne_chunk(pos_tag)
            #print ner_tree
            # the list of tuples((word, pos),ner) to be considered for this sentence
            matching_tuples = []
            for subtree in ner_tree.subtrees():
                if (subtree.label() in NER_TAG[q_type]
                        and subtree.pos()[0][0][1] == 'NNP'):
                    print subtree
                    iskwin = map(lambda x: x in subtree.pos()[0][0][0],
                                 q_keywords)
                    if not any(iskwin):
                        matching_tuples = subtree.pos()
            # t : ((word, pos), ner)
            answer = ''
            for t in matching_tuples:
                #print t
                if t[0][0] not in q_keywords:
                    answer += t[0][0] + ' '
            # remove any possible trailing whitespaces
            answer = answer.rstrip()
            answers.append((doc_num, answer))
    print answers
    return answers
Example #8
def performTagging(featureObjects):
    taggedLines = []
    for obj in featureObjects:
        taggedLine = ""
        try:
            taggedLine = timex.tag(
                obj.getLexicalFeatures().getSpellCorrection().lower())
            taggedLine = timex.ground(taggedLine, timex.gmt())
        except:
            taggedLine = ""

        if not Utilities.isEmpty(taggedLine):
            obj.getSyntacticFeatures().setTemporalTag(
                Utilities.firstMatching(TIMEX_TAG_REGEX, taggedLine))
            taggedLines.append(obj)

    return taggedLines
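TIMEX_TAG_REGEX and the Utilities helpers are external to this snippet. Judging from the other examples in this section, a plausible shape for the regex is:

import re

# Hypothetical; matches the markup emitted by timex.tag()/ground().
TIMEX_TAG_REGEX = re.compile(r'<TIMEX2[^>]*>(.*?)</TIMEX2>')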
Example #9
def getTime(newsid):
    """
    Get the earliest tagged date in the article, in Unix time (i.e. seconds since 1970/1/1).
    """
    flag = 0
    fp = open('database/'+newsid)
    soup = BeautifulSoup(fp)
    tagtimes = list()
    #content +=soup.find('title').get_text()
    for i in soup.findAll('p'):
        text = i.get_text()
        #    content += text.strip() + "\n"
        try:
            tagged = timex.ground(timex.tag(text), getBasetime(newsid))
        except ValueError:
            continue
        soup2 = BeautifulSoup(tagged)
        if soup2.timex2 is not None:
            flag = 1
            for i in soup2.findAll('timex2'):
                try:
                    # print "tagged time: " + str(i)
                    timestr = i['val']
                except KeyError:
                    print "Error tagged time: " + str(i)
                    continue
                if timestr != 'UNKNOWN':
                    try:
                        tagtimes.append(int(dateutil.parser.parse(timestr).strftime('%s')))
                    except ValueError:
                        continue
                else:
                    print i
    if flag ==0:
        # print("OMG no tags!")
        randtime = random.randint(1370016000,1416758400)
        print("Fail to get timetag from news " + str(newsid) + " , assign " + str(randtime))
        return int(randtime)
    else:
        if tagtimes != list():
            print("Time prediction for news " + str(newsid) + ": " + str(np.array(tagtimes).min()))
            return np.array(tagtimes).min()
        else:
            randtime = random.randint(1370016000,1416758400)
            print("Fail to get timetag from news " + str(newsid) + " , assign " + str(randtime))
            return int(randtime)
Example #11
def timexWrapper(text):
    """
    wrap timex
    
    @type  text: list [word] (ordered by index)
    @param text: the text to be tagged with time expressions
    
    @rtype:  tuple (list [TimeExpression], list [word])
    @return: list of time expressions extracted from text and list of unmatched words
    """

    text_str = " ".join([x.word for x in text])
    timeExpressions = []
    uncovered_tokens = [[x, False] for x in text]
    try:
        #timex's ground function isn't reliable
        ground_res = ground(tag(text_str), gmt())
    except:
        return ([], text)

    for s, val in ground_res[1]:
        textInd = text_str.find(s)
        curText = []
        numOfItems = len(s.split())
        startInd = len(text_str[:textInd].split())
        if textInd > 0:
            if text_str[textInd - 1] != " ":
                # deal with time expressions starting in the middle of words
                startInd -= 1
        for i in range(startInd, startInd + numOfItems):
            uncovered_tokens[i][1] = True
            curText.append(text[i])
        timeExpressions.append(TimeExpression(curText, val))

    iterList = list(enumerate([x for x in uncovered_tokens]))
    iterList.reverse()
    for i, (x, flag) in iterList:
        if flag:
            del (uncovered_tokens[i])
        else:
            uncovered_tokens[i] = x

    return timeExpressions, uncovered_tokens
Example #12
def main():
    data_path = "./origin_data/riedel/nyt-2005-2006.backup/"
    output_path = "./data/"
    with open(output_path + "processed.txt", "wb") as fout:
        outputs = []
        for item in os.listdir(data_path):
            with open(data_path + item, "rb") as fin:
                if item[-3:] != ".pb":
                    continue
                # pdb.set_trace()
                doc = Document_pb2.Document()
                doc.ParseFromString(fin.read())
                # whole_doc is for time extraction
                whole_doc = []
                valid_set = []
                for sentence in doc.sentences:
                    s = []
                    m = []
                    t = []
                    # extract the token words into one sentence.
                    for token in sentence.tokens:
                        s.append(token.word)
                    # extract mentions
                    for mention in sentence.mentions:
                        # mention got entity_name, mform, to
                        m.append(["_".join(s[int(mention.mfrom):int(mention.to+1)]), mention.mfrom, mention.to])
                    # if there are fewer than 2 mentions, this sentence may not be in my train-test set.
                    # there could also be more than 2 mentions, so further iterations may be needed for processing.
                    if len(m) < 2:
                        valid_set.append([m, s])
                    whole_doc.append(s)
                # tagging op should appear in each doc iter
                # since we need the whole doc to set base-time
                # whole_doc is a list of word lists, so flatten it before tagging
                timex_found, whole_doc = timex.tag(" ".join(" ".join(s) for s in whole_doc))
                if len(timex_found) > 0:
                    # set base-time tobe the last time found.
                    base_t = timex.retrieve_Date_time(timex_found)

                pdb.set_trace()
Example #13
def get_sentence_dates(url):

    # Extract sentences from the html
    response = urlopen(url)
    content = response.read()

    # Get only relevant text
    soup = BeautifulSoup(content)
    split = [elm.text.encode("utf-8") for elm in soup.findAll('p')]
    content = ' '.join(split)
    raw = nltk.clean_html(content)

    # Strip raw
    raw = re.sub(r'(\n)|\[.*?\] ?', "", raw)
    raw = re.sub(r'(\n)|\(.*?\) ?', "", raw)
    raw = re.sub(r"\r\n", ".", raw)
    #raw = re.sub(r";", ".", raw)
    #raw = re.sub(r"\"", ".", raw)    
    
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    sentences = sent_detector.tokenize(raw.strip())

    more_sentences = []
    for sentence in sentences:
        sentence = sentence.strip().split(".")
        more_sentences += sentence

    sentences = more_sentences

    #Clean sentences
    for index, sentence in enumerate(sentences):
        split_up = sentence.split()
        sentences[index] = ' '.join(split_up)

    sentence_date_maps = {}

    # Assign a date to each sentence
    for index, sentence in enumerate(sentences):    
        timed_sentence = timex.tag(str(sentence))
        for result in re.finditer(".*<TIMEX2>(.*)</TIMEX2>.*", timed_sentence):
            for r in result.groups():
                if r not in sentence_date_maps:
                    sentence_date_maps[r] = []
                sentence_date_maps[r].append(sentence)

    sentence_dates = []
    for date, sentences in sentence_date_maps.items():
        try:
            year = int(date)
            if year > START and year < END:
                #for sentence in sentences:
                #    res = []
                #    print(sentence)
                #    if len(sentence) > 10 and  len(sentence) < 500:
                #        res.append(sentence)
                #if res != []:
                #    sentence_dates.append((year, res))
                sentence_dates.append((year, sentences))
                        
        except:
            continue

    sentence_dates = sorted(sentence_dates, key=lambda x:x[0])

    return json.dumps(sentence_dates)
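START and END are assumed to be module-level year bounds that keep only plausible dates. Note that nltk.clean_html is no longer supported in NLTK 3, so this snippet targets an older NLTK release. A hedged usage sketch:

# Hypothetical bounds; any (year, sentences) pair outside them is dropped.
START, END = 1900, 2030

result = get_sentence_dates("https://en.wikipedia.org/wiki/History_of_aviation")
print(result)  # JSON list of [year, [sentences mentioning that year]], sorted by year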
Example #14
for tree in chunked:
    # print results per sentence
    # print extract_entity_names(tree)

    entity_names.extend(extract_entity_names(tree))

# print unique entity names
base_date = datetime.date.today()
#now = datetime.date.today()
#basedate = datetime.Date(now.year, now.month, now.day)

tagged_text = []
for sentence in sentences:
    #newsent = dateparser.parse(sentence)
    #newsent = search_dates(sentence)
    newsent = tag(sentence)
    #dt.append(newsent)
    tagged_text.append(newsent)
#dates = tag(sentences);

# ground() expects a single tagged string, so ground each sentence separately
dt = []
for tagged_sentence in tagged_text:
    dt.append(ground(tagged_sentence, base_date))

print(dt)

#unique = set(entity_names)
#unique.append(dates)
#print(unique)