Example #1
import nltk
import spacy
import pandas as pd
from datetime import datetime
from lib.ctbcfxSQL import ctbcfxSQL


def traverseTree(tree):
    """Depth-first traversal of an nltk parse tree: collect each node's label
    (or the leaf token itself) into the module-level dfs_list."""
    global dfs_list
    if isinstance(tree, nltk.tree.Tree):
        dfs_list.append(tree.label())
    else:
        # leaf token: record it and stop, there is nothing to descend into
        dfs_list.append(tree)
        return
    for subtree in tree:
        if isinstance(subtree, nltk.tree.Tree):
            traverseTree(subtree)
        else:
            dfs_list.append(subtree)
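
dfs_list = []  # module-level list that traverseTree() appends into

# a minimal usage sketch (the sample parse string is hypothetical):
#   tree = nltk.tree.Tree.fromstring('(S (NP the Fed) (VP raised (NP rates)))')
#   traverseTree(tree)
#   print(dfs_list)  # ['S', 'NP', 'the', 'Fed', 'VP', 'raised', 'NP', 'rates']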


DBconn = ctbcfxSQL()

# preprocessing of the news data: pull recent zerohedge articles and lowercase the text
raw_data = DBconn.query(cols='*',
                        table='zerohedge',
                        condition_str='time > "2017-10-20 00:00:00"')
fomc_news = pd.DataFrame(raw_data)
fomc_news['content'] = fomc_news['content'].apply(lambda x: x.lower())

now = datetime.now()

# load the spaCy English model
nlp = spacy.load('en')

# load the entity & sentiment dictionary
entity_cb = pd.read_csv(r'C:\Users\z00013855\Desktop\treemodel\ctbc1026.csv',
                        encoding='utf-8')  # CTBC_mod5
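
A hedged sketch of how the loaded model might be applied next, assuming the pipeline continues by dependency-parsing each article; the token attributes used are standard spaCy fields, not confirmed by the snippet:

# a minimal sketch: run the spaCy pipeline over a few articles and inspect dependency arcs
for content in fomc_news['content'].head(3):
    doc = nlp(content)
    print([(tok.text, tok.dep_, tok.head.text) for tok in doc[:10]])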
Example #2
import sys
import os
from lib.ctbcfxSQL import ctbcfxSQL
import pandas as pd
import numpy as np
import pytz
from datetime import datetime, timedelta

DBconn = ctbcfxSQL()  # global SQL connection


#========= INSERT FUNCTIONS =============
def cmod(tablename, filepath):
    """Load a word/entity/aspect dictionary file and insert it into the database."""
    print('start inserting file: "%s" to TABLE[%s]...' % (filepath, tablename))
    with open(filepath, 'r', encoding='utf8') as f:
        for i, line in enumerate(f):
            if i == 0:
                continue  # skip the header row
            line = line.replace('\n', '').replace('\r', '')
            data1 = line.split(',')
            word = data1[0]

            # the second column encodes "entity#aspect"
            data2 = data1[1].split('#')
            entity = data2[0]
            aspect = data2[1]

            DBconn.insert('entity', ['entity'], [entity])
            DBconn.insert('aspect', ['entity', 'aspect'], [entity, aspect])
            DBconn.insert('word', ['aspect', 'word'], [aspect, word])
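
A hedged usage sketch for cmod; the file name and rows below are hypothetical. Note that the tablename argument only appears in the log line, while the inserts always target the entity, aspect, and word tables:

# dictionary file layout (the header row is skipped; these rows are hypothetical):
#   word,entity#aspect
#   hawkish,FED#policy
#   rate hike,FED#rates
cmod('entity', 'ctbc_dictionary.csv')  # hypothetical file name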
Example #3
import time
from lib.ctbcfxSQL import ctbcfxSQL
# NOTE: the crawler classes used below (CNBC_Crawler, Forexfactory_Crawler,
# Forexlive_Crawler, ReutersMarket_Crawler, ReutersWorld_Crawler,
# Zerohedge_Crawler) come from this project's own crawler modules; their
# import path is not shown in this snippet.


def crawl_and_insert(dbconn, source_name, crawler, table, cols, time_idx):
    """Run one crawler, clean the resulting DataFrame, and insert the rows into MySQL."""
    print('source: %s | crawler activated.' % source_name)
    df = crawler.execute()
    print('complete!')
    # drop columns that are more than half empty, then any remaining rows with missing values
    df.dropna(axis=1, how='any', inplace=True, thresh=int(len(df.index) / 2))
    df.dropna(axis=0, how='any', inplace=True)
    dflist = df.values.tolist()
    print('Data inserting into MySQL (amount=%d)' % len(dflist), end='...')
    for d in dflist:
        # strip the UTC-offset suffix from the timestamp column
        d[time_idx] = str(d[time_idx]).split('+')[0]
        dbconn.insert(table, cols, d)
    print('complete!')


def main():
    # initialize utilities
    print('[start mission!]\n--------------------------------')
    startTime = time.time()
    DBconn = ctbcfxSQL()

    # start crawlers & insert into the databases; every source repeats the same
    # crawl -> clean -> insert pattern, factored out above
    crawl_and_insert(DBconn, 'CNBC', CNBC_Crawler(stoppage=25),
                     'cnbc_us', ['title', 'time', 'content'], time_idx=1)
    crawl_and_insert(DBconn, 'Forex factory', Forexfactory_Crawler(),
                     'forexfactory', ['time', 'title', 'content'], time_idx=0)
    crawl_and_insert(DBconn, 'Forex Live', Forexlive_Crawler(stoppage=50),
                     'forexlive', ['time', 'author', 'title', 'content'], time_idx=0)
    crawl_and_insert(DBconn, 'Reuters Market', ReutersMarket_Crawler(stoppage=50),
                     'reuters', ['title', 'time', 'content'], time_idx=1)
    crawl_and_insert(DBconn, 'Reuters World', ReutersWorld_Crawler(stoppage=50),
                     'reuters', ['title', 'time', 'content'], time_idx=1)
    crawl_and_insert(DBconn, 'Zerohedge', Zerohedge_Crawler(stoppage=25),
                     'zerohedge', ['title', 'time', 'content', 'tag'], time_idx=1)

    # delete duplicates left behind by overlapping crawls
    print('start deleting duplicates:')
    for label, table in [('CNBC', 'cnbc_us'), ('Forex Factory', 'forexfactory'),
                         ('Forex Live', 'forexlive'), ('Reuters', 'reuters'),
                         ('Zerohedge', 'zerohedge')]:
        print('%s running' % label, end='...')
        DBconn.crawler_DelDup(table)
        print('DONE!')

    # process end & report the time consumed
    elapsed = time.time() - startTime
    print('------------------------\ntime elapsed: %dm %.1fs' % (int(elapsed / 60), elapsed % 60.0))
    print('[endTime="%s"]\n' % time.ctime())
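
A minimal entry point for running the whole pipeline; the __main__ guard is a standard addition not present in the original snippet:

if __name__ == '__main__':
    main()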