def get_reuters_news_headlines(function_pointer, *args):
    """Build a DataFrame of firm-related headlines that overlap with other headlines.

    The company name for ticker ``args[0]`` is looked up on finviz and its
    first word is used as the firm keyword. For every headline ``i`` whose
    token set contains that keyword, each later headline ``j`` sharing at
    least ``threshold`` tokens with it is kept, followed by headline ``i``
    itself. ``threshold`` starts at 1, is raised to the overlap of every
    accepted pair and is never reset between headlines.

    Args:
        function_pointer: Callable invoked as ``function_pointer(*args)``. It
            must return four parallel sequences: headlines, abstracts,
            timestamps and anchors (the anchors are not used here).
        *args: ``args[0]`` is the ticker and ``args[1]`` the date; all
            arguments are forwarded unchanged to ``function_pointer``.

    Returns:
        A pandas DataFrame with the columns ``ticker``, ``date``,
        ``headlines``, ``abstract`` and ``timestamp``, one row per kept
        headline.
    """
    from nlp import nltk_metricle
    import finviz

    company_table = finviz.read_finviz_data('http://finviz.com/export.ashx?v=111&t=' + args[0])
    firm_name = company_table['Company'][0].split()[0]

    headlines, abstracts, timestamps, _anchors = function_pointer(*args)

    # One token set per headline, restricted to the listed POS tags.
    pos_tags = ('NN', 'NNS', 'NNP', 'JJ', 'VB')
    token_sets = []
    for headline in headlines:
        features = nltk_metricle.clean_document_return_features(headline, False)
        token_sets.append(set(nltk_metricle.find_features_from_POS(features, *pos_tags)))

    threshold = 1
    kept_headlines = []
    kept_abstracts = []
    kept_timestamps = []

    def keep(position):
        # Record headline `position` in all three parallel output lists.
        kept_headlines.append(headlines[position])
        kept_abstracts.append(abstracts[position])
        kept_timestamps.append(timestamps[position])

    # NOTE(review): the same headline can be recorded more than once (as a
    # match of several earlier headlines and again as an anchor). This is the
    # original behaviour and is preserved; confirm whether de-duplication is
    # wanted.
    for i in range(len(token_sets) - 1):
        anchor_tokens = token_sets[i]
        # The firm check does not depend on j, so test it once per anchor.
        if firm_name not in anchor_tokens:
            continue
        matched = False
        for j in range(i + 1, len(token_sets)):
            overlap = len(anchor_tokens & token_sets[j])
            if overlap >= threshold:
                threshold = overlap
                keep(j)
                matched = True
        if matched:
            keep(i)

    # `pd` is expected to be imported at module level (not visible in this block).
    df = pd.DataFrame(
        columns=['ticker', 'date', 'headlines', 'abstract', 'timestamp'],
        index=list(range(len(kept_headlines))),
    )
    df['ticker'] = args[0]  # ticker
    df['date'] = args[1]  # date
    df['timestamp'] = kept_timestamps
    df['headlines'] = kept_headlines
    df['abstract'] = kept_abstracts
    return df
# Example #2
# 0
def get_reuters_news_headlines(function_pointer, *args):
    """Build a DataFrame of overlapping, firm-related headlines with their keywords.

    The company name for ticker ``args[0]`` is looked up on finviz and
    tokenized. For every headline ``i``, each later headline ``j`` is kept
    when it shares at least ``threshold`` tokens with headline ``i`` and its
    own token set contains at least one token of the company name. If any
    such ``j`` was found, headline ``i`` itself is kept as well.
    ``threshold`` starts at 1, is raised to the overlap of every accepted
    pair and is never reset between headlines.

    Args:
        function_pointer: Callable invoked as ``function_pointer(*args)``. It
            must return four parallel sequences: headlines, abstracts,
            timestamps and anchors (the anchors are not used here).
        *args: ``args[0]`` is the ticker and ``args[1]`` the date; all
            arguments are forwarded unchanged to ``function_pointer``.

    Returns:
        A pandas DataFrame with the columns ``ticker``, ``date``,
        ``headlines``, ``abstract``, ``timestamp`` and ``keywords``, one row
        per kept headline.
    """
    import finviz

    # The NLP helpers are loaded from their source file under root_directory.
    # `imp`, `os`, `root_directory` and `pd` are expected at module level
    # (not visible in this block).
    nltk_metricle = imp.load_source(
        'nltk_metricle', os.path.join(root_directory, 'nlp', 'nltk_metricle.py'))

    firm_name = finviz.read_finviz_data('http://finviz.com/export.ashx?v=111&t=' + args[0])['Company'][0]
    firm_tokens = set(nltk_metricle.create_tokens(firm_name))

    headlines, abstracts, timestamps, _anchors = function_pointer(*args)

    # One token set per headline, restricted to the listed POS tags.
    pos_tags = ('NN', 'NNS', 'NNP', 'JJ', 'VB')
    token_sets = []
    for headline in headlines:
        features = nltk_metricle.clean_document_return_features(headline, False)
        token_sets.append(set(nltk_metricle.find_features_from_POS(features, *pos_tags)))

    threshold = 1
    kept_headlines = []
    kept_abstracts = []
    kept_timestamps = []
    kept_keywords = []

    def keep(position):
        # Record headline `position` in all four parallel output lists.
        kept_headlines.append(headlines[position])
        kept_abstracts.append(abstracts[position])
        kept_timestamps.append(timestamps[position])
        kept_keywords.append(
            nltk_metricle.find_features_from_POS(list(token_sets[position])))

    # NOTE(review): the firm-name test is applied to headline j only, so an
    # anchor headline i that never mentions the firm can still be kept, and
    # the same headline can be recorded more than once. Both are the original
    # behaviour and are preserved; confirm whether they are intended.
    for i in range(len(token_sets) - 1):
        anchor_tokens = token_sets[i]
        matched = False
        for j in range(i + 1, len(token_sets)):
            candidate_tokens = token_sets[j]
            overlap = len(anchor_tokens & candidate_tokens)
            if overlap >= threshold and not firm_tokens.isdisjoint(candidate_tokens):
                threshold = overlap
                keep(j)
                matched = True
        if matched:
            keep(i)

    df = pd.DataFrame(
        columns=['ticker', 'date', 'headlines', 'abstract', 'timestamp'],
        index=list(range(len(kept_headlines))),
    )
    df['ticker'] = args[0]  # ticker
    df['date'] = args[1]  # date
    df['timestamp'] = kept_timestamps
    df['headlines'] = kept_headlines
    df['abstract'] = kept_abstracts
    df['keywords'] = kept_keywords
    return df