Example No. 1
    def evaluate(self, s, t):
        tweebo_api = API()
        text_data_s = [s]
        text_data_t = [t]
        max_score = 0
        try:
            # Parse each raw string into the CoNLL language representation.
            result_conll_s = tweebo_api.parse_conll(text_data_s)
            result_conll_t = tweebo_api.parse_conll(text_data_t)

            # Attach the ROOT relation, then build NLTK dependency trees.
            nltk_result_s = self.add_root_node(result_conll_s)
            nltk_result_t = self.add_root_node(result_conll_t)
            dep_tree_s = DependencyGraph(nltk_result_s[0]).tree()
            dep_tree_t = DependencyGraph(nltk_result_t[0]).tree()
            #dep_tree_s.draw()

            # Collect the root-to-leaf paths of each tree.
            paths_s = []
            self.traverse(dep_tree_s, dep_tree_s.label(), paths_s)

            paths_t = []
            self.traverse(dep_tree_t, dep_tree_t.label(), paths_t)

            # Score every pair of paths with the N-gram measure and keep
            # the best match.
            for string_s in paths_s:
                for string_t in paths_t:
                    result = self.evaluateNgram(string_s, string_t)
                    if result > max_score:
                        max_score = result
        except ServerError as e:
            print(f'{e}\n{e.message}')
        return max_score
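The method above leans on two helpers, `traverse` and `evaluateNgram`, that the snippet does not show. A minimal sketch of what they might look like, assuming `traverse` collects root-to-leaf label paths and `evaluateNgram` scores two paths by word n-gram overlap (both bodies are illustrative guesses, not the original implementation):

    def traverse(self, tree, path, paths):
        # Hypothetical helper: walk the nltk.Tree depth-first, recording the
        # label sequence from the root down to every leaf.
        for child in tree:
            if isinstance(child, str):  # leaf token
                paths.append(f'{path} {child}')
            else:                       # nested subtree
                self.traverse(child, f'{path} {child.label()}', paths)

    def evaluateNgram(self, string_s, string_t, n=2):
        # Hypothetical helper: Jaccard overlap of the word n-grams of two paths.
        def ngrams(s):
            tokens = s.split()
            return {tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)}
        grams_s, grams_t = ngrams(string_s), ngrams(string_t)
        if not grams_s or not grams_t:
            return 0
        return len(grams_s & grams_t) / len(grams_s | grams_t)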
Example No. 2
def test_api_stanford():
    '''
    Tests :py:func:`tweebo_parser.API.parse_stanford` where the output type \
    is stanford styled. We perform the following tests:

    1. 3 different sentences (one of the sentences contains a UTF character)
    2. 5 sentences that include empty sentences.
    3. Empty list
    '''

    tweebo_api = API()
    stanford_0 = {'index': 0, 'tokens': TOKENS_0, 'basicDependencies': B_DEP_0}
    stanford_1 = {'index': 1, 'tokens': TOKENS_1, 'basicDependencies': B_DEP_1}
    stanford_2 = {'index': 2, 'tokens': TOKENS_2, 'basicDependencies': B_DEP_2}
    expected_return = [stanford_0, stanford_1, stanford_2]
    assert expected_return == tweebo_api.parse_stanford(TEST_SENTENCES_0)

    empty = {'index': 1, 'tokens': [], 'basicDependencies': []}
    last_empty = copy.deepcopy(empty)
    last_empty['index'] = 4
    stanford_1['index'] = 2
    stanford_2['index'] = 3
    expected_return = [stanford_0, empty, stanford_1, stanford_2, last_empty]
    assert expected_return == tweebo_api.parse_stanford(TEST_SENTENCES_1)

    assert tweebo_api.parse_stanford([]) == []
Example No. 3
    def __callTweeboParser(self, cleanedTweets):
        """Parse the cleaned tweets with the TweeboParser Python API.

        Arguments:
            cleanedTweets {list} -- the list of cleaned tweets
        Returns:
            result_conll_terms -- each tweet's CoNLL parse split into lines
        """
        tweebo_api = API()
        result_conll = []
        try:
            result_conll = tweebo_api.parse_conll(cleanedTweets)
        except ServerError as e:
            print(f'{e}\n{e.message}')
        result_conll_terms = [r.split("\n") for r in result_conll]
        return result_conll_terms

    def convertToGraphs(self, X):
        tweebo_api = API()
        text_data = X
        graphs = dict()
        try:
            result_conll = tweebo_api.parse_conll(
                text_data)  # ISSUE? Only finding 7 graphs!

            # Add the ROOT relation once, then build one tree per sentence.
            nltk_result = self.add_root_node(result_conll)
            for i in tqdm(range(len(nltk_result))):
                dep_tree = DependencyGraph(nltk_result[i]).tree()
                graphs[X[i]] = dep_tree

            print("-----Conversion Successful-----")
        except ServerError as e:
            print(f'{e}\n{e.message}')
        return graphs
Example No. 5
def test_api_conll():
    '''
    Tests :py:func:`tweebo_parser.API.parse_conll` where the output type is \
    conll. We perform the following tests:

    1. 3 different sentences (one of the sentences contains a UTF specific \
    character)
    2. 5 sentences that include empty sentences.
    3. Empty list
    '''

    tweebo_api = API()
    expected_return = [CONLL_0, CONLL_1, CONLL_2]
    assert expected_return == tweebo_api.parse_conll(TEST_SENTENCES_0)

    expected_return = [CONLL_0, '', CONLL_1, CONLL_2, '']
    assert expected_return == tweebo_api.parse_conll(TEST_SENTENCES_1)

    assert tweebo_api.parse_conll([]) == []
Example No. 6
def test_api_exceptions():
    '''
    Test that exceptions are raised when wrong input is given.

    1. Test HTTPError raises when List of integers are given instead of a \
    List of Strings.
    2. Test HTTPError raises when a String is given instead of a List of \
    Strings.
    '''

    def cause_error(data: Any, exception: Any, api: API):
        functions = ['parse_conll', 'parse_stanford']
        for function in functions:
            with pytest.raises(exception):
                getattr(api, function)(data)

    tweebo_api = API()
    cause_error([1], requests.exceptions.HTTPError, tweebo_api)
    cause_error('hello how are you', requests.exceptions.HTTPError, tweebo_api)
Example No. 7
#--------- all tweebo-dependent code should be in this file -----------
#BEFORE RUNNING THIS CODE:
#Install Docker https://docs.docker.com/get-docker/
#docker run -p 8000:8000 -d --rm mooreap/tweeboparserdocker
import pandas as pd
from tweebo_parser import API, ServerError

filename = input('Enter the file name/path from here: ')
start = int(input('Enter row number to start from (inclusive): '))
end = int(input('Enter row number to end at (exclusive): '))
out = input('Enter filename of output csv: ')

file = pd.read_csv(filename, index_col=False)

file = file.iloc[start:end, :]

tweets = file['tweet'].tolist()

tweebo_api = API()
try:
    result = tweebo_api.parse_conll(tweets)
except ServerError as e:
    print(f'{e}\n{e.message}')
    raise SystemExit(1)  # without a parse result there is nothing to write out

file['CoNLL'] = result

file.to_csv(out, index=False)
Example No. 8
    def __new__(cls):
        if TweeboParser.instance is None:
            hostname, port = cls.get_config()
            TweeboParser.instance = API(hostname=hostname, port=port,
                                        log_errors=True)
        return TweeboParser.instance
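Example No. 8 shows only the `__new__` method; it presupposes a `TweeboParser` class with a class-level `instance` slot and a `get_config` classmethod. A minimal sketch of that scaffolding, assuming the host and port come from environment variables with the local Docker server as the default (the real config source is not shown in the snippet):

import os


class TweeboParser:
    # Shared singleton slot checked by __new__ above: the first call to
    # TweeboParser() builds the API client, later calls return the same object.
    instance = None

    @classmethod
    def get_config(cls):
        # Assumed config source; 0.0.0.0:8000 matches the Docker server used
        # in the other examples.
        hostname = os.getenv('TWEEBO_HOSTNAME', '0.0.0.0')
        port = int(os.getenv('TWEEBO_PORT', '8000'))
        return hostname, port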
Example No. 9
def add_root_node(list_conll_sentences):
    '''
    This adds the ROOT relation to CoNLL formatted data.
    '''
    temp_list_conll_sentences = []
    for conll_sentences in list_conll_sentences:
        temp_conll_sentences = []
        for sentence in conll_sentences.split('\n'):
            sentence = sentence.split('\t')
            if int(sentence[6]) == 0:
                sentence[7] = 'ROOT'
            temp_conll_sentences.append('\t'.join(sentence))
        conll_sentences = '\n'.join(temp_conll_sentences)
        temp_list_conll_sentences.append(conll_sentences)
    return temp_list_conll_sentences
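In CoNLL format, column 7 (index 6) is the token's HEAD and column 8 (index 7) its dependency relation, which is why `add_root_node` rewrites field 7 whenever field 6 is 0. A toy check with a single fabricated line:

toy = ['1\tHello\t_\tUH\tUH\t_\t0\t_\t_\t_']
print(add_root_node(toy))
# ['1\tHello\t_\tUH\tUH\t_\t0\tROOT\t_\t_']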


tweebo_api = API()  # Assumes server is running locally at 0.0.0.0:8000
text_data = [
    '!!!!!!""@__BrighterDays: I can not just sit up and HATE on another bitch .. I got too much shit going on!""',
    'I can not just sit up and HATE on another bitch .. I got too much shit going on!'
]
try:
    #parse the raw string into two different language representation formats
    result_stanford = tweebo_api.parse_stanford(text_data)
    result_conll = tweebo_api.parse_conll(text_data)

    nltk_result = add_root_node(result_conll)
    nltk_dep_tree_0 = DependencyGraph(nltk_result[0])
    nltk_dep_tree_1 = DependencyGraph(nltk_result[1])

    #print(result_stanford)
    #print(result_conll)
except ServerError as e:
    print(f'{e}\n{e.message}')