def evaluate(self, s, t):
    tweebo_api = API()
    text_data_s = [s]
    text_data_t = [t]
    max_score = 0
    try:
        # parse each raw string into the CoNLL language representation format
        # result_stanford = tweebo_api.parse_stanford(text_data)
        result_conll_s = tweebo_api.parse_conll(text_data_s)
        result_conll_t = tweebo_api.parse_conll(text_data_t)
        nltk_result_s = self.add_root_node(result_conll_s)
        nltk_result_t = self.add_root_node(result_conll_t)
        dep_tree_s = DependencyGraph(nltk_result_s[0]).tree()
        dep_tree_t = DependencyGraph(nltk_result_t[0]).tree()
        # dep_tree.draw()

        # collect the dependency-path strings of each tree
        paths_s = []
        self.traverse(dep_tree_s, dep_tree_s.label(), paths_s)
        paths_t = []
        self.traverse(dep_tree_t, dep_tree_t.label(), paths_t)

        # compute the N-gram score for every pair of paths and keep the best
        for string_s in paths_s:
            for string_t in paths_t:
                result = self.evaluateNgram(string_s, string_t)
                if result > max_score:
                    max_score = result
        return max_score
    except ServerError as e:
        print(f'{e}\n{e.message}')
        return max_score
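# The traverse and evaluateNgram helpers are not shown in the snippet above.
# Below is a minimal sketch of what they might look like, inferred purely from
# their call sites; both bodies are hypothetical reconstructions, not the
# original implementation (written as free functions here, while the originals
# are methods on the same class).

def traverse(tree, path, paths):
    """Hypothetical: collect root-to-leaf label paths of an nltk Tree as
    space-separated strings."""
    for child in tree:
        if hasattr(child, 'label'):          # internal node (nltk.Tree)
            traverse(child, path + ' ' + child.label(), paths)
        else:                                # leaf token (str)
            paths.append(path + ' ' + child)


def evaluateNgram(string_s, string_t, n=2):
    """Hypothetical: Jaccard overlap of the n-grams of two path strings,
    a score in [0, 1]."""
    def ngrams(tokens):
        return {tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)}
    grams_s, grams_t = ngrams(string_s.split()), ngrams(string_t.split())
    if not grams_s or not grams_t:
        return 0.0
    return len(grams_s & grams_t) / len(grams_s | grams_t)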
def test_api_stanford():
    '''
    Tests :py:func:`tweebo_parser.API.parse_stanford` where the output type \
    is stanford styled.

    We perform the following tests:

    1. 3 different sentences (one of the sentences contains a UTF character)
    2. 5 sentences that include empty sentences.
    3. Empty list
    '''
    tweebo_api = API()
    stanford_0 = {'index': 0, 'tokens': TOKENS_0, 'basicDependencies': B_DEP_0}
    stanford_1 = {'index': 1, 'tokens': TOKENS_1, 'basicDependencies': B_DEP_1}
    stanford_2 = {'index': 2, 'tokens': TOKENS_2, 'basicDependencies': B_DEP_2}
    expected_return = [stanford_0, stanford_1, stanford_2]
    assert expected_return == tweebo_api.parse_stanford(TEST_SENTENCES_0)

    empty = {'index': 1, 'tokens': [], 'basicDependencies': []}
    last_empty = copy.deepcopy(empty)
    last_empty['index'] = 4
    stanford_1['index'] = 2
    stanford_2['index'] = 3
    expected_return = [stanford_0, empty, stanford_1, stanford_2, last_empty]
    assert expected_return == tweebo_api.parse_stanford(TEST_SENTENCES_1)

    assert tweebo_api.parse_stanford([]) == []
def __callTweeboParser(self, cleanedTweets):
    """Parse the cleaned tweets with the TweeboParser Python API.

    Arguments:
        cleanedTweets {list} -- the list of cleaned tweets

    Returns:
        result_conll_terms -- [r1, r2, ...], one list of CoNLL lines per tweet
    """
    tweebo_api = API()
    result_conll = []  # stays empty if the server call fails
    try:
        result_conll = tweebo_api.parse_conll(cleanedTweets)
    except ServerError as e:
        print(f'{e}\n{e.message}')
    result_conll_terms = [r.split("\n") for r in result_conll]
    return result_conll_terms
def convertToGraphs(self, X):
    tweebo_api = API()
    text_data = X
    graphs = dict()
    try:
        result_conll = tweebo_api.parse_conll(text_data)
        # add the ROOT relation once, then build one dependency tree per
        # parsed sentence (indexing nltk_result[i], not nltk_result[0],
        # so every input gets its own graph)
        nltk_result = self.add_root_node(result_conll)
        for i in tqdm(range(len(nltk_result))):
            dep_tree = DependencyGraph(nltk_result[i]).tree()
            graphs[X[i]] = dep_tree
        print("-----Conversion Successful-----")
    except ServerError as e:
        print(f'{e}\n{e.message}')
    return graphs
def test_api_conll():
    '''
    Tests :py:func:`tweebo_parser.API.parse_conll` where the output type is \
    conll.

    We perform the following tests:

    1. 3 different sentences (one of the sentences contains a UTF specific \
       character)
    2. 5 sentences that include empty sentences.
    3. Empty list
    '''
    tweebo_api = API()
    expected_return = [CONLL_0, CONLL_1, CONLL_2]
    assert expected_return == tweebo_api.parse_conll(TEST_SENTENCES_0)

    expected_return = [CONLL_0, '', CONLL_1, CONLL_2, '']
    assert expected_return == tweebo_api.parse_conll(TEST_SENTENCES_1)

    assert tweebo_api.parse_conll([]) == []
def test_api_exceptions():
    '''
    Test that exceptions are raised when wrong input is given.

    1. Test HTTPError is raised when a List of integers is given instead of \
       a List of Strings.
    2. Test HTTPError is raised when a String is given instead of a List of \
       Strings.
    '''
    def cause_error(data: Any, exception: Any, api: API):
        functions = ['parse_conll', 'parse_stanford']
        for function in functions:
            with pytest.raises(exception):
                getattr(api, function)(data)

    tweebo_api = API()
    cause_error([1], requests.exceptions.HTTPError, tweebo_api)
    cause_error('hello how are you', requests.exceptions.HTTPError,
                tweebo_api)
# --------- all tweebo-dependent code should be in this file -----------
# BEFORE RUNNING THIS CODE:
# Install Docker: https://docs.docker.com/get-docker/
# Then start the TweeboParser server:
#   docker run -p 8000:8000 -d --rm mooreap/tweeboparserdocker

import pandas as pd
from tweebo_parser import API, ServerError

filename = input('Enter the file name/path from here: ')
start = int(input('Enter row number to start from (inclusive): '))
end = int(input('Enter row number to end at (exclusive): '))
out = input('Enter filename of output csv: ')

file = pd.read_csv(filename, index_col=False)
file = file.iloc[start:end, :]
tweets = file['tweet'].tolist()

tweebo_api = API()
try:
    result = tweebo_api.parse_conll(tweets)
except ServerError as e:
    print(f'{e}\n{e.message}')
    raise SystemExit(1)  # no parses; don't write a broken csv

file['CoNLL'] = result
file.to_csv(out, index=False)
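# For reference, each element parse_conll returns is one string of
# tab-separated, CoNLL-style rows, one token per line, with HEAD and DEPREL at
# 0-based column indices 6 and 7 (the fields the add_root_node helpers in the
# other snippets rewrite). The row below is an illustrative sketch, not actual
# parser output:

# CoNLL-X column order: ID FORM LEMMA CPOSTAG POSTAG FEATS HEAD DEPREL ...
example_row = '2\tcan\t_\t_\tV\t_\t3\t_\t_\t_'
fields = example_row.split('\t')
print('token =', fields[1], '| head =', fields[6], '| rel =', fields[7])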
def __new__(cls):
    if TweeboParser.instance is None:
        hostname, port = cls.get_config()
        TweeboParser.instance = API(hostname=hostname, port=port,
                                    log_errors=True)
    return TweeboParser.instance
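# A minimal sketch of the class this __new__ presumably lives in, so the
# snippet can be run end to end. The class shell, the instance attribute, and
# a get_config that reads environment variables are all assumptions inferred
# from the snippet, not the original code.

import os
from tweebo_parser import API


class TweeboParser:
    instance = None  # module-wide singleton: every caller shares one client

    @classmethod
    def get_config(cls):
        # Assumed config source: environment variables with local defaults.
        return (os.environ.get('TWEEBO_HOSTNAME', '0.0.0.0'),
                int(os.environ.get('TWEEBO_PORT', '8000')))

    def __new__(cls):
        if TweeboParser.instance is None:
            hostname, port = cls.get_config()
            TweeboParser.instance = API(hostname=hostname, port=port,
                                        log_errors=True)
        return TweeboParser.instance


# Both names point at the same API client:
assert TweeboParser() is TweeboParser()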
def add_root_node(list_conll_sentences):
    '''
    This adds the ROOT relation to CoNLL formatted data.
    '''
    temp_list_conll_sentences = []
    for conll_sentences in list_conll_sentences:
        temp_conll_sentences = []
        for sentence in conll_sentences.split('\n'):
            sentence = sentence.split('\t')
            # a HEAD of 0 marks the sentence root; label its relation ROOT
            if int(sentence[6]) == 0:
                sentence[7] = 'ROOT'
            temp_conll_sentences.append('\t'.join(sentence))
        conll_sentences = '\n'.join(temp_conll_sentences)
        temp_list_conll_sentences.append(conll_sentences)
    return temp_list_conll_sentences


tweebo_api = API()  # Assumes server is running locally at 0.0.0.0:8000
text_data = [
    '!!!!!!""@__BrighterDays: I can not just sit up and HATE on another bitch .. I got too much shit going on!""',
    'I can not just sit up and HATE on another bitch .. I got too much shit going on!'
]
try:
    # parse the raw strings into two different language representation formats
    result_stanford = tweebo_api.parse_stanford(text_data)
    result_conll = tweebo_api.parse_conll(text_data)
    nltk_result = add_root_node(result_conll)
    nltk_dep_tree_0 = DependencyGraph(nltk_result[0])
    nltk_dep_tree_1 = DependencyGraph(nltk_result[1])
    # print(result_stanford)
    # print(result_conll)
except ServerError as e:
    print(f'{e}\n{e.message}')
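# The DependencyGraph used throughout these snippets is nltk's; once the
# head-0 token carries the ROOT relation (which is exactly what add_root_node
# ensures), .tree() can build an nltk Tree from it. A self-contained example
# with an illustrative hand-written CoNLL string, not real TweeboParser
# output:

from nltk.parse import DependencyGraph

conll_string = ('1\tI\t_\t_\tO\t_\t3\t_\t_\t_\n'
                '2\tcan\t_\t_\tV\t_\t3\t_\t_\t_\n'
                '3\tsit\t_\t_\tV\t_\t0\tROOT\t_\t_')
dep_graph = DependencyGraph(conll_string)
print(dep_graph.tree())  # prints the bracketed tree: (sit I can)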