Example #1
def tweebo(texts):
    '''
    Given a list of Strings, tokenises, POS tags and then dependency parses
    the text using `Tweebo <https://github.com/ikekonglp/TweeboParser>`_,
    a Tweet-specific parser.

    The Tweebo parser cannot handle empty strings, therefore a special empty
    string symbol is required.

    If one of the texts is an empty String then an empty list will be returned
    for that index of the returned list.

    :param texts: The texts that are to be parsed
    :type texts: list
    :returns: A list of lists of DependencyToken instances, one list per text \
    in the texts argument.
    :rtype: list
    '''

    def no_text(text):
        '''
        Given a String, checks whether it is empty; if so returns an empty
        token, else returns the text that was given.

        :param text: Text to be checked
        :type text: String
        :returns: The text if it is not empty or empty token if it is.
        :rtype: String
        '''

        empty_token = '$$$EMPTY$$$'
        if text.strip() == '':
            return empty_token
        return text
    with tempfile.TemporaryDirectory() as working_dir:
        with tempfile.TemporaryDirectory() as temp_dir:
            text_file_path = os.path.join(temp_dir, 'text_file.txt')
            result_file_path = os.path.join(temp_dir, 'text_file.txt.predict')
            tweebo_dir = full_path(read_config('depdency_parsers')['tweebo_dir'])
            with open(text_file_path, 'w+') as text_file:
                for text in texts:
                    text = no_text(text)
                    text_file.write(text)
                    text_file.write('\n')
            run_script = os.path.join(tweebo_dir, 'python_run.sh')
            completed = subprocess.run(['bash', run_script, text_file_path, working_dir])
            if completed.returncode == 0:
                with open(result_file_path, 'r') as result_file:
                    return tweebo_post_process(result_file.read())
            else:
                raise SystemError('Could not run the Tweebo run script {}'\
                                  .format(run_script))
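
A small usage sketch for the function above (the input texts are hypothetical, and it assumes the 'depdency_parsers' config entry and a working TweeboParser install):

texts = ['I love this phone', '', 'the battery however is terrible']
parsed = tweebo(texts)
# One list of DependencyToken instances per input text; the empty String at
# index 1 comes back as an empty list, as stated in the docstring.
assert len(parsed) == len(texts)
assert parsed[1] == []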
Example #2
    def get_lexicon(self):
        '''
        Overrides :py:func:`tdparse.lexicons.Lexicon.get_lexicon`
        '''

        sentiment_folder = full_path(read_config('lexicons')['hu_liu'])
        cats = ['positive', 'negative']
        word_cat = []
        for cat in cats:
            file_path = os.path.join(sentiment_folder, '{}-words.txt'.format(cat))
            with open(file_path, 'r', encoding='cp1252') as senti_file:
                for line in senti_file:
                    if re.search('^;', line) or re.search(r'^\W+', line):
                        continue
                    line = line.strip()
                    word_cat.append((line, cat))
        return word_cat
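
The regex checks above skip the comment header at the top of the Hu and Liu files; a self-contained illustration of that filtering (the header lines are an assumption about the file layout):

import re

# Assumed layout: ';' comment header lines followed by one word per line.
sample_lines = ['; Opinion Lexicon: Positive\n', ';\n', 'good\n', 'great\n']
kept = [line.strip() for line in sample_lines
        if not (re.search('^;', line) or re.search(r'^\W+', line))]
# kept == ['good', 'great']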
Example #3
    def get_lexicon(self):
        '''
        Overrides :py:func:`tdparse.lexicons.Lexicon.get_lexicon`
        '''

        emotion_file_path = full_path(read_config('lexicons')['nrc_emotion'])
        word_cat = []

        with open(emotion_file_path, 'r', newline='') as emotion_file:
            tsv_reader = csv.reader(emotion_file, delimiter='\t')
            for row in tsv_reader:
                if len(row):
                    word = row[0]
                    cat = row[1]
                    association = int(row[2])
                    if association:
                        word_cat.append((word, cat))
        return word_cat
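
A minimal sketch of the row handling above, using made-up rows in the word<TAB>category<TAB>association layout the reader expects; only pairs whose association flag is 1 are kept:

import csv
import io

# Made-up rows (illustrative only): word, emotion category, association flag.
sample_tsv = 'abandon\tfear\t1\nabandon\tjoy\t0\n'
pairs = []
for row in csv.reader(io.StringIO(sample_tsv), delimiter='\t'):
    if len(row) and int(row[2]):
        pairs.append((row[0], row[1]))
# pairs == [('abandon', 'fear')]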
Example #4
def tweebo_install(tweebo_func):
    '''
    Python decorator that ensures that
    `TweeboParser <https://github.com/ikekonglp/TweeboParser>`_ is installed,
    before running the function it wraps. Returns the given function.

    :param tweebo_func: A function that uses the Tweebo Parser.
    :type tweebo_func: function
    :returns: The given function
    :rtype: function
    '''

    tweebo_dir = full_path(read_config('depdency_parsers')['tweebo_dir'])
    # If the models archive exists then Tweebo has already been installed (or a
    # previous install attempt was made)
    tweebo_models = os.path.join(tweebo_dir, 'pretrained_models.tar.gz')
    if not os.path.isfile(tweebo_models):
        install_script = os.path.join(tweebo_dir, 'install.sh')
        subprocess.run(['bash', install_script])
    return tweebo_func
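
Because the decorator only runs the install check and hands the wrapped function straight back, it can sit directly above any Tweebo-dependent function; pairing it with the tweebo function from example #1 is an assumption for illustration:

# Assumed pairing: ensure TweeboParser is installed before tweebo is first used.
@tweebo_install
def tweebo(texts):
    ...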
Example #5
    def get_lexicon(self):
        '''
        Overrides :py:func:`tdparse.lexicons.Lexicon.get_lexicon`
        '''

        mpqa_file_path = full_path(read_config('lexicons')['mpqa'])
        word_cats = []
        with open(mpqa_file_path, 'r') as mpqa_file:
            for line in mpqa_file:
                line = line.strip()
                if line:
                    key_values = {}
                    for data in line.split():
                        if '=' in data:
                            key, value = data.split('=')
                            key_values[key] = value
                    word = key_values['word1']
                    cat = key_values['priorpolarity']
                    if cat == 'weakneg':
                        cat = key_values['polarity']
                    word_cats.append((word, cat))
        return word_cats
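
The key=value split above turns each MPQA entry into a dict; an illustrative line follows (the exact fields are an assumption about the lexicon's file format):

line = 'type=weaksubj len=1 word1=abandoned pos1=adj stemmed1=n priorpolarity=negative'
key_values = dict(data.split('=') for data in line.split() if '=' in data)
# key_values['word1'] == 'abandoned'
# key_values['priorpolarity'] == 'negative'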
Example #6
def train(data):
    if data == 'dong':
        vo_zhang_path = read_config('word2vec_files')['vo_zhang']
        vo_zhang = GensimVectors(vo_zhang_path, None, model='word2vec')
        train_data = read_config('dong_twit_train_data')
        train_data = dong(train_data)
        test_data = read_config('dong_twit_test_data')
        test_data = dong(test_data)

        union_parameters = {'svm__C': [0.01]}

        union_pipeline = Pipeline(
            [('union',
              FeatureUnion([
                  ('left',
                   Pipeline([
                       ('contexts', Context({'l'})),
                       ('tokens', ContextTokeniser(ark_twokenize, False)),
                       ('word_vectors', ContextWordVectors(vo_zhang)),
                       ('pool_funcs',
                        FeatureUnion([
                            ('max_pipe',
                             Pipeline([('max', NeuralPooling(matrix_max)),
                                       ('join',
                                        JoinContextVectors(matrix_median))])),
                            ('min_pipe',
                             Pipeline([('min', NeuralPooling(matrix_min)),
                                       ('join',
                                        JoinContextVectors(matrix_median))])),
                            ('avg_pipe',
                             Pipeline([('avg', NeuralPooling(matrix_avg)),
                                       ('join',
                                        JoinContextVectors(matrix_median))])),
                            ('prod_pipe',
                             Pipeline([('prod', NeuralPooling(matrix_prod)),
                                       ('join',
                                        JoinContextVectors(matrix_median))])),
                            ('std_pipe',
                             Pipeline([('std', NeuralPooling(matrix_std)),
                                       ('join',
                                        JoinContextVectors(matrix_median))]))
                        ]))
                   ])),
                  ('right',
                   Pipeline([
                       ('contexts', Context({'r'})),
                       ('tokens', ContextTokeniser(ark_twokenize, False)),
                       ('word_vectors', ContextWordVectors(vo_zhang)),
                       ('pool_funcs',
                        FeatureUnion([
                            ('max_pipe',
                             Pipeline([('max', NeuralPooling(matrix_max)),
                                       ('join',
                                        JoinContextVectors(matrix_median))])),
                            ('min_pipe',
                             Pipeline([('min', NeuralPooling(matrix_min)),
                                       ('join',
                                        JoinContextVectors(matrix_median))])),
                            ('avg_pipe',
                             Pipeline([('avg', NeuralPooling(matrix_avg)),
                                       ('join',
                                        JoinContextVectors(matrix_median))])),
                            ('prod_pipe',
                             Pipeline([('prod', NeuralPooling(matrix_prod)),
                                       ('join',
                                        JoinContextVectors(matrix_median))])),
                            ('std_pipe',
                             Pipeline([('std', NeuralPooling(matrix_std)),
                                       ('join',
                                        JoinContextVectors(matrix_median))]))
                        ]))
                   ])),
                  ('target',
                   Pipeline([
                       ('contexts', Context({'t'})),
                       ('tokens', ContextTokeniser(ark_twokenize, False)),
                       ('word_vectors', ContextWordVectors(vo_zhang)),
                       ('pool_funcs',
                        FeatureUnion([
                            ('max_pipe',
                             Pipeline([('max', NeuralPooling(matrix_max)),
                                       ('join',
                                        JoinContextVectors(matrix_median))])),
                            ('min_pipe',
                             Pipeline([('min', NeuralPooling(matrix_min)),
                                       ('join',
                                        JoinContextVectors(matrix_median))])),
                            ('avg_pipe',
                             Pipeline([('avg', NeuralPooling(matrix_avg)),
                                       ('join',
                                        JoinContextVectors(matrix_median))])),
                            ('prod_pipe',
                             Pipeline([('prod', NeuralPooling(matrix_prod)),
                                       ('join',
                                        JoinContextVectors(matrix_median))])),
                            ('std_pipe',
                             Pipeline([('std', NeuralPooling(matrix_std)),
                                       ('join',
                                        JoinContextVectors(matrix_median))]))
                        ]))
                   ])),
                  ('full',
                   Pipeline([
                       ('contexts', Context({'f'})),
                       ('tokens', ContextTokeniser(ark_twokenize, False)),
                       ('word_vectors', ContextWordVectors(vo_zhang)),
                       ('pool_funcs',
                        FeatureUnion([
                            ('max_pipe',
                             Pipeline([('max', NeuralPooling(matrix_max)),
                                       ('join',
                                        JoinContextVectors(matrix_median))])),
                            ('min_pipe',
                             Pipeline([('min', NeuralPooling(matrix_min)),
                                       ('join',
                                        JoinContextVectors(matrix_median))])),
                            ('avg_pipe',
                             Pipeline([('avg', NeuralPooling(matrix_avg)),
                                       ('join',
                                        JoinContextVectors(matrix_median))])),
                            ('prod_pipe',
                             Pipeline([('prod', NeuralPooling(matrix_prod)),
                                       ('join',
                                        JoinContextVectors(matrix_median))])),
                            ('std_pipe',
                             Pipeline([('std', NeuralPooling(matrix_std)),
                                       ('join',
                                        JoinContextVectors(matrix_median))]))
                        ]))
                   ]))
              ])), ('svm', LinearSVC(C=0.01))])

        train_y_values = [
            target_dict['sentiment'] for target_dict in train_data
        ]
        test_y_values = [target_dict['sentiment'] for target_dict in test_data]

        #grid_search = GridSearchCV(union_pipeline, param_grid=union_parameters,
        #                           cv=5, scoring='accuracy', n_jobs=1)
        #grid_clf = grid_search.fit(train_data, train_y_values)

        union_pipeline.fit(train_data, train_y_values)
        preds = union_pipeline.predict(test_data)

        import code
        code.interact(local=locals())
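
An optional follow-up sketch (the metric mirrors the accuracy scoring in the commented-out GridSearchCV above; printing it here is an assumption about intent):

from sklearn.metrics import accuracy_score
print('Test accuracy: {:.4f}'.format(accuracy_score(test_y_values, preds)))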
Example #7
# Metrics
from sklearn.metrics import accuracy_score

# Models
from tdparse.models.tdlstm import TLSTM
# Tokenisers
from tdparse.tokenisers import whitespace, ark_twokenize
# Word Vectors
from tdparse.word_vectors import PreTrained, GloveTwitterVectors
# Get the data
from tdparse.helper import read_config, full_path
from tdparse.parsers import dong

# Load the datasets
dong_train = dong(full_path(read_config('dong_twit_train_data')))
dong_test = dong(full_path(read_config('dong_twit_test_data')))
# Load the word vectors
sswe_path = full_path(read_config('sswe_files')['vo_zhang'])
sswe = PreTrained(sswe_path, name='sswe')
#glove_50 = GloveTwitterVectors(50)
#glove_100 = GloveTwitterVectors(100)
#glove_200 = GloveTwitterVectors(200)

params = [
    [0.00001, 0.00001],
    [0.00001, 0.001],
    [0.00001, 0.1],
    [0.001, 0.00001],
    [0.001, 0.001],
    [0.001, 0.1],