import os
import subprocess
import tempfile

from tdparse.helper import read_config, full_path


def tweebo(texts):
    '''
    Given a list of Strings will tokenise, POS tag and then dependency parse
    the text using `Tweebo <https://github.com/ikekonglp/TweeboParser>`_, a
    Tweet-specific parser.

    The Tweebo parser cannot handle empty strings, therefore a special
    empty-string symbol is required. If one of the texts is an empty String
    then an empty list will be returned for that index of the returned list.

    :param texts: The texts that are to be parsed
    :type texts: list
    :returns: A list of lists of DependencyToken instances. A list per text \
    in the texts argument.
    :rtype: list
    '''

    def no_text(text):
        '''
        Given a String checks if it is empty; if so returns an empty token,
        else the text that was given.

        :param text: Text to be checked
        :type text: String
        :returns: The text if it is not empty, or the empty token if it is.
        :rtype: String
        '''
        empty_token = '$$$EMPTY$$$'
        if text.strip() == '':
            return empty_token
        return text

    with tempfile.TemporaryDirectory() as working_dir:
        with tempfile.TemporaryDirectory() as temp_dir:
            text_file_path = os.path.join(temp_dir, 'text_file.txt')
            result_file_path = os.path.join(temp_dir, 'text_file.txt.predict')
            tweebo_dir = full_path(read_config('depdency_parsers')['tweebo_dir'])
            with open(text_file_path, 'w+') as text_file:
                for text in texts:
                    text = no_text(text)
                    text_file.write(text)
                    text_file.write('\n')
            run_script = os.path.join(tweebo_dir, 'python_run.sh')
            # CompletedProcess is always truthy, so check the return code
            # explicitly to detect a failed run.
            if subprocess.run(['bash', run_script, text_file_path,
                               working_dir]).returncode == 0:
                with open(result_file_path, 'r') as result_file:
                    # tweebo_post_process is assumed to be defined elsewhere
                    # in this module.
                    return tweebo_post_process(result_file.read())
            else:
                raise SystemError('Could not run the Tweebo run script {}'
                                  .format(run_script))
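# A minimal usage sketch of the function above. The DependencyToken API is
# not shown here, so only the documented empty-string behaviour is asserted:
parsed = tweebo(['I love this phone', '', 'so much'])
assert parsed[1] == []      # an empty input text maps to an empty list
left, _, right = parsed     # one list of DependencyToken instances per text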
def get_lexicon(self):
    '''
    Overrides :py:func:`tdparse.lexicons.Lexicon.get_lexicon`
    '''
    sentiment_folder = full_path(read_config('lexicons')['hu_liu'])
    cats = ['positive', 'negative']
    word_cat = []
    for cat in cats:
        file_path = os.path.join(sentiment_folder, '{}-words.txt'.format(cat))
        with open(file_path, 'r', encoding='cp1252') as senti_file:
            for line in senti_file:
                # Skip comment lines (starting with ';') and lines that start
                # with non-word characters (e.g. blank lines).
                if re.search('^;', line) or re.search(r'^\W+', line):
                    continue
                word_cat.append((line.strip(), cat))
    return word_cat
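# A self-contained sketch of the filtering above over illustrative lines
# (the sample lines are an assumption about the Hu & Liu file layout, where
# ';' marks header comments and each remaining line is one word):
import re
for raw in ['; header comment\n', '\n', 'a+\n', 'abound\n']:
    skipped = bool(re.search('^;', raw) or re.search(r'^\W+', raw))
    print(repr(raw.strip()), 'skipped' if skipped else 'kept')
# -> '; header comment' and '' are skipped; 'a+' and 'abound' are kept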
def get_lexicon(self):
    '''
    Overrides :py:func:`tdparse.lexicons.Lexicon.get_lexicon`
    '''
    emotion_file_path = full_path(read_config('lexicons')['nrc_emotion'])
    word_cat = []
    with open(emotion_file_path, 'r', newline='') as emotion_file:
        tsv_reader = csv.reader(emotion_file, delimiter='\t')
        for row in tsv_reader:
            if len(row):
                word = row[0]
                cat = row[1]
                association = int(row[2])
                # Only keep (word, category) pairs whose association flag is 1
                if association:
                    word_cat.append((word, cat))
    return word_cat
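# A self-contained sketch of the same filtering over an in-memory sample,
# assuming the standard NRC layout word<TAB>category<TAB>association
# (the sample rows are illustrative, not taken from the file):
import csv
sample_rows = 'abandon\tanger\t0\nabandon\tfear\t1'.splitlines()
for row in csv.reader(sample_rows, delimiter='\t'):
    if row and int(row[2]):
        print((row[0], row[1]))     # -> ('abandon', 'fear')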
def tweebo_install(tweebo_func):
    '''
    Python decorator that ensures that
    `TweeboParser <https://github.com/ikekonglp/TweeboParser>`_ is installed
    before the function it wraps is run. Returns the given function.

    :param tweebo_func: A function that uses the Tweebo Parser.
    :type tweebo_func: function
    :returns: The given function
    :rtype: function
    '''
    tweebo_dir = full_path(read_config('depdency_parsers')['tweebo_dir'])
    # If the models file exists then Tweebo has been installed or failed to
    # install
    tweebo_models = os.path.join(tweebo_dir, 'pretrained_models.tar.gz')
    if not os.path.isfile(tweebo_models):
        install_script = os.path.join(tweebo_dir, 'install.sh')
        subprocess.run(['bash', install_script])
    return tweebo_func
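# Hedged sketch of intended use (whether `tweebo` above is actually declared
# with this decorator is an assumption; the install check runs once, when the
# module is imported, and the function is returned unchanged):
@tweebo_install
def parse_tweets(texts):    # hypothetical wrapped function
    return tweebo(texts)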
def get_lexicon(self):
    '''
    Overrides :py:func:`tdparse.lexicons.Lexicon.get_lexicon`
    '''
    mpqa_file_path = full_path(read_config('lexicons')['mpqa'])
    word_cats = []
    with open(mpqa_file_path, 'r') as mpqa_file:
        for line in mpqa_file:
            line = line.strip()
            if line:
                # Parse the space-separated key=value fields of each entry
                key_values = {}
                for data in line.split():
                    if '=' in data:
                        key, value = data.split('=')
                        key_values[key] = value
                word = key_values['word1']
                cat = key_values['priorpolarity']
                # Fall back to the 'polarity' field for the malformed
                # 'weakneg' entries
                if cat == 'weakneg':
                    cat = key_values['polarity']
                word_cats.append((word, cat))
    return word_cats
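# A self-contained parsing sketch over one illustrative MPQA entry (the
# key=value layout is an assumption about the distributed subjectivity-clues
# file):
sample = ('type=weaksubj len=1 word1=abandoned pos1=adj '
          'stemmed1=n priorpolarity=negative')
key_values = dict(field.split('=') for field in sample.split() if '=' in field)
print(key_values['word1'], key_values['priorpolarity'])  # abandoned negative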
def train(data):
    if data == 'dong':
        vo_zhang_path = read_config('word2vec_files')['vo_zhang']
        vo_zhang = GensimVectors(vo_zhang_path, None, model='word2vec')

        train_data = dong(read_config('dong_twit_train_data'))
        test_data = dong(read_config('dong_twit_test_data'))

        # Only used by the (currently disabled) grid search below
        union_parameters = {'svm__C': [0.01]}
        # The four context sub-pipelines are identical apart from the Context
        # argument ('l', 'r', 't', 'f'); see the factored sketch below.
        union_pipeline = Pipeline([
            ('union', FeatureUnion([
                ('left', Pipeline([
                    ('contexts', Context({'l'})),
                    ('tokens', ContextTokeniser(ark_twokenize, False)),
                    ('word_vectors', ContextWordVectors(vo_zhang)),
                    ('pool_funcs', FeatureUnion([
                        ('max_pipe', Pipeline([
                            ('max', NeuralPooling(matrix_max)),
                            ('join', JoinContextVectors(matrix_median))])),
                        ('min_pipe', Pipeline([
                            ('min', NeuralPooling(matrix_min)),
                            ('join', JoinContextVectors(matrix_median))])),
                        ('avg_pipe', Pipeline([
                            ('avg', NeuralPooling(matrix_avg)),
                            ('join', JoinContextVectors(matrix_median))])),
                        ('prod_pipe', Pipeline([
                            ('prod', NeuralPooling(matrix_prod)),
                            ('join', JoinContextVectors(matrix_median))])),
                        ('std_pipe', Pipeline([
                            ('std', NeuralPooling(matrix_std)),
                            ('join', JoinContextVectors(matrix_median))]))
                    ]))
                ])),
                ('right', Pipeline([
                    ('contexts', Context({'r'})),
                    ('tokens', ContextTokeniser(ark_twokenize, False)),
                    ('word_vectors', ContextWordVectors(vo_zhang)),
                    ('pool_funcs', FeatureUnion([
                        ('max_pipe', Pipeline([
                            ('max', NeuralPooling(matrix_max)),
                            ('join', JoinContextVectors(matrix_median))])),
                        ('min_pipe', Pipeline([
                            ('min', NeuralPooling(matrix_min)),
                            ('join', JoinContextVectors(matrix_median))])),
                        ('avg_pipe', Pipeline([
                            ('avg', NeuralPooling(matrix_avg)),
                            ('join', JoinContextVectors(matrix_median))])),
                        ('prod_pipe', Pipeline([
                            ('prod', NeuralPooling(matrix_prod)),
                            ('join', JoinContextVectors(matrix_median))])),
                        ('std_pipe', Pipeline([
                            ('std', NeuralPooling(matrix_std)),
                            ('join', JoinContextVectors(matrix_median))]))
                    ]))
                ])),
                ('target', Pipeline([
                    ('contexts', Context({'t'})),
                    ('tokens', ContextTokeniser(ark_twokenize, False)),
                    ('word_vectors', ContextWordVectors(vo_zhang)),
                    ('pool_funcs', FeatureUnion([
                        ('max_pipe', Pipeline([
                            ('max', NeuralPooling(matrix_max)),
                            ('join', JoinContextVectors(matrix_median))])),
                        ('min_pipe', Pipeline([
                            ('min', NeuralPooling(matrix_min)),
                            ('join', JoinContextVectors(matrix_median))])),
                        ('avg_pipe', Pipeline([
                            ('avg', NeuralPooling(matrix_avg)),
                            ('join', JoinContextVectors(matrix_median))])),
                        ('prod_pipe', Pipeline([
                            ('prod', NeuralPooling(matrix_prod)),
                            ('join', JoinContextVectors(matrix_median))])),
                        ('std_pipe', Pipeline([
                            ('std', NeuralPooling(matrix_std)),
                            ('join', JoinContextVectors(matrix_median))]))
                    ]))
                ])),
                ('full', Pipeline([
                    ('contexts', Context({'f'})),
                    ('tokens', ContextTokeniser(ark_twokenize, False)),
                    ('word_vectors', ContextWordVectors(vo_zhang)),
                    ('pool_funcs', FeatureUnion([
                        ('max_pipe', Pipeline([
                            ('max', NeuralPooling(matrix_max)),
                            ('join', JoinContextVectors(matrix_median))])),
                        ('min_pipe', Pipeline([
                            ('min', NeuralPooling(matrix_min)),
                            ('join', JoinContextVectors(matrix_median))])),
                        ('avg_pipe', Pipeline([
                            ('avg', NeuralPooling(matrix_avg)),
                            ('join', JoinContextVectors(matrix_median))])),
                        ('prod_pipe', Pipeline([
                            ('prod', NeuralPooling(matrix_prod)),
                            ('join', JoinContextVectors(matrix_median))])),
                        ('std_pipe', Pipeline([
                            ('std', NeuralPooling(matrix_std)),
                            ('join', JoinContextVectors(matrix_median))]))
                    ]))
                ]))
            ])),
            ('svm', LinearSVC(C=0.01))])

        train_y_values = [target_dict['sentiment']
                          for target_dict in train_data]
        test_y_values = [target_dict['sentiment']
                         for target_dict in test_data]

        #grid_search = GridSearchCV(union_pipeline, param_grid=union_parameters,
        #                           cv=5, scoring='accuracy', n_jobs=1)
        #grid_clf = grid_search.fit(train_data, train_y_values)
        union_pipeline.fit(train_data, train_y_values)
        preds = union_pipeline.predict(test_data)
        # Drop into an interactive shell to inspect preds and test_y_values
        import code
        code.interact(local=locals())
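# A hedged refactoring sketch of the pipeline above: the four context
# sub-pipelines differ only in the Context argument, so a helper (the name
# `make_context_pipeline` is hypothetical, not part of the codebase) could
# build each one and keep the construction equivalent:
def make_context_pipeline(context_labels, word_vectors):
    pool_pipes = [
        (name + '_pipe', Pipeline([
            (name, NeuralPooling(pool_func)),
            ('join', JoinContextVectors(matrix_median))]))
        for name, pool_func in [('max', matrix_max), ('min', matrix_min),
                                ('avg', matrix_avg), ('prod', matrix_prod),
                                ('std', matrix_std)]]
    return Pipeline([
        ('contexts', Context(context_labels)),
        ('tokens', ContextTokeniser(ark_twokenize, False)),
        ('word_vectors', ContextWordVectors(word_vectors)),
        ('pool_funcs', FeatureUnion(pool_pipes))])

# Equivalent construction of union_pipeline using the helper:
union_pipeline = Pipeline([
    ('union', FeatureUnion(
        [(ctx_name, make_context_pipeline({label}, vo_zhang))
         for ctx_name, label in [('left', 'l'), ('right', 'r'),
                                 ('target', 't'), ('full', 'f')]])),
    ('svm', LinearSVC(C=0.01))])
# The interactive session could also be replaced with an explicit metric,
# e.g. accuracy_score(test_y_values, preds) from sklearn.metrics.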
# Metrics
from sklearn.metrics import accuracy_score
# Models
from tdparse.models.tdlstm import TLSTM
# Tokenisers
from tdparse.tokenisers import whitespace, ark_twokenize
# Word Vectors
from tdparse.word_vectors import PreTrained, GloveTwitterVectors
# Get the data
from tdparse.helper import read_config, full_path
from tdparse.parsers import dong

# Load the datasets
dong_train = dong(full_path(read_config('dong_twit_train_data')))
dong_test = dong(full_path(read_config('dong_twit_test_data')))
# Load the word vectors
sswe_path = full_path(read_config('sswe_files')['vo_zhang'])
sswe = PreTrained(sswe_path, name='sswe')
#glove_50 = GloveTwitterVectors(50)
#glove_100 = GloveTwitterVectors(100)
#glove_200 = GloveTwitterVectors(200)

params = [
    [0.00001, 0.00001],
    [0.00001, 0.001],
    [0.00001, 0.1],
    [0.001, 0.00001],
    [0.001, 0.001],
    [0.001, 0.1],