def test_example1(self):
     """Compare transform_document output for a four-token 'name: URL' sentence.

     The expected list is the fixture value; what each position means is
     defined by structural_features_spacy, not by this test.
     """
     # One row per token: (text, spacy_is_punct, spacy_like_url).
     token_rows = [
         (b'Kiten', False, False),
         (b'Master', False, False),
         (b':', True, False),
         (b'http://youtu.be/jVVD0OZk-6g', False, True),
     ]
     # Token indices are 1-based and follow the row order.
     tokens = [
         Token(position, raw, spacy_is_punct=punct, spacy_like_url=url)
         for position, (raw, punct, url) in enumerate(token_rows, start=1)
     ]
     thf_sentence = THFSentenceExport(
         'p339_s003', None, 'Kiten Master: http://youtu.be/jVVD0OZk-6g',
         tokens, None, 1)
     feature_value = structural_features_spacy.transform_document(
         thf_sentence, True)
     token_count = len(tokens)
     expected_value = [
         3, 0.0 / token_count, 0.0 / token_count, 4, 1, 1, 0.0, 0.0, 0.0, 1.0,
     ]
     self.assertEqual(feature_value, expected_value)
 def test_count_polarity_bearing_tokens_example2(self):
     """count_polarity_bearing_tokens must return [0] for three tokens built with text=None."""
     tokens = [
         Token(token_index_in_sentence=position, text=None)
         for position in (1, 2, 3)
     ]
     thf_sentence = THFSentenceExport(None, None, None, tokens, None, 1)
     counter = sentiws_polarity_bearing_tokens_feature.count_polarity_bearing_tokens
     feature_value = counter(thf_sentence)
     self.assertEqual(feature_value, [0])
 def test_extract_average_polarity_example2(self):
     """extract_average_polarity must return [0.0] for three tokens built with text=None."""
     tokens = [
         Token(token_index_in_sentence=position, text=None)
         for position in (1, 2, 3)
     ]
     thf_sentence = THFSentenceExport(None, None, None, tokens, None, 1)
     feature_value = polarity_sentiws_feature.extract_average_polarity(
         thf_sentence)
     self.assertEqual(feature_value, [0.0])
 def test_example6(self):
     """Compare transform_document output for a nine-token sentence without a URL.

     The expected list is the fixture value; what each position means is
     defined by structural_features_spacy, not by this test.
     """
     # One row per token: (text, spacy_is_punct, spacy_like_url).
     token_rows = [
         (b'kleine', False, False),
         (b'Elektrodrohnen', False, False),
         (b'just', False, False),
         (b'for', False, False),
         (b'fun', False, False),
         (b',', True, False),
         (b'warum', False, False),
         (b'nicht', False, False),
         (b'.', True, False),
     ]
     # Token indices are 1-based and follow the row order.
     tokens = [
         Token(position, raw, spacy_is_punct=punct, spacy_like_url=url)
         for position, (raw, punct, url) in enumerate(token_rows, start=1)
     ]
     thf_sentence = THFSentenceExport(
         'p339_s003', None,
         'kleine Elektrodrohnen just for fun, warum nicht.',
         tokens, None, 1)
     feature_value = structural_features_spacy.transform_document(
         thf_sentence, True)
     token_count = len(tokens)
     expected_value = [
         3, 1.0 / token_count, 1.0 / token_count, 9, 0, 2, 1.0, 0.0, 0.0, 0.0,
     ]
     self.assertEqual(feature_value, expected_value)
 def test_count_different_ner_labels_example5(self):
     """count_different_ner_labels must give [0, 1, 0] when the only entity is a two-token LOC span."""
     # One row per token: (text, universal POS, NER type, NER IOB flag).
     token_rows = [
         (b'Vorbild', 'NOUN', '', 'O'),
         (b'New', 'PROPN', 'LOC', 'B'),
         (b'York', 'PROPN', 'LOC', 'I'),
         (b':', 'PUNCT', '', 'O'),
         (b'http://www.houndsandpeople.com/de/magazin/kultur/new-york-city-dogs-teil-und-seele-der-weltmetropole/', 'AUX', '', 'O'),
     ]
     # Token indices are 1-based and follow the row order.
     tokens = [
         Token(position, raw, spacy_pos_universal_google=pos,
               spacy_ner_type=ner_type, spacy_ner_iob=iob)
         for position, (raw, pos, ner_type, iob) in enumerate(token_rows, start=1)
     ]
     sentence_text = "Vorbild New York: http://www.houndsandpeople.com/de/magazin/kultur/new-york-city-dogs-teil-und-seele-der-weltmetropole/"
     thf_sentence = THFSentenceExport(None, None, sentence_text, tokens, None, 1)
     feature_value = ner_feature.count_different_ner_labels(thf_sentence.tokens)
     expected_value = np.array([0, 1, 0], dtype=np.float64)
     arrays_match = np.array_equal(feature_value, expected_value)
     self.assertEqual(arrays_match, True)
 def test_example4(self):
     """Compare transform_document output for an eight-token sentence with a whitespace token and a URL.

     The expected list is the fixture value; what each position means is
     defined by structural_features_spacy, not by this test.
     """
     # One row per token: (text, spacy_is_punct, spacy_like_url).
     token_rows = [
         (b'Hier', False, False),
         (b'eine', False, False),
         (b'Konzept', False, False),
         (b'-', True, False),
         (b'Grafik', False, False),
         (b':', True, False),
         (b' ', False, False),
         (b'http://i.imgur.com/JGlqExO.jpg', False, True),
     ]
     # Token indices are 1-based and follow the row order.
     tokens = [
         Token(position, raw, spacy_is_punct=punct, spacy_like_url=url)
         for position, (raw, punct, url) in enumerate(token_rows, start=1)
     ]
     thf_sentence = THFSentenceExport(
         'p339_s003', None,
         'Hier eine Konzept-Grafik:  http://i.imgur.com/JGlqExO.jpg',
         tokens, None, 1)
     feature_value = structural_features_spacy.transform_document(
         thf_sentence, True)
     token_count = len(tokens)
     expected_value = [
         3, 0.0 / token_count, 0.0 / token_count, 8, 1, 2, 0.0, 0.0, 0.0, 1.0,
     ]
     self.assertEqual(feature_value, expected_value)
def load_v3(file_path='data/THF/sentence/subtaskA_train.json',
            group_claims=True):
    """
    Loads THF sentences from a JSON file whose entries carry their token and
    dependency data under ``NLP.tokens`` and ``NLP.dependencies``.

    Each token dict is passed to ``Token(**token)`` after a fixed set of keys
    has been dropped; each dependency dict is passed to ``Dependency(**dependency)``.

    :param file_path: path to the JSON file
    :param group_claims: if true, the labels 'ClaimContra' and 'ClaimPro' are
        both replaced by 'Claim'
    :return: list of THFSentenceExport objects, one per entry in the file
    """
    # Keys stripped from every token dict before it reaches the Token
    # constructor. All but "embedding" are legacy token information that has
    # been replaced by spaCy.
    dropped_token_keys = (
        "embedding",
        "mate_tools_pos_tag",
        "mate_tools_lemma",
        "pos_tag",
        "tree_tagger_lemma",
    )
    logger.debug(u'Parsing JSON File: {}'.format(file_path))
    # The file is only needed while parsing; release it before building models.
    with open(file_path, encoding='utf-8') as data_file:
        data = json.load(data_file)
    sentences = []
    for sentence in data:
        tokens = []
        for token in sentence["NLP"]["tokens"]:
            for key in dropped_token_keys:
                # A key that is already absent needs no removal, so do not
                # fail with KeyError on files that omit it.
                token.pop(key, None)
            tokens.append(Token(**token))
        dependencies = [
            Dependency(**dependency)
            for dependency in sentence["NLP"]["dependencies"]
        ]
        label = sentence["Label"]
        if group_claims and label in ('ClaimContra', 'ClaimPro'):
            label = 'Claim'
        sentence_model = THFSentenceExport(sentence["UniqueID"],
                                           label,
                                           sentence["Text"],
                                           tokens,
                                           dependencies,
                                           textdepth=sentence["TextDepth"])
        sentences.append(sentence_model)
    logger.info('Parsed {} sentences'.format(len(sentences)))
    return sentences
def load(file_path='data/THF/sentence/subtaskA_train.json', group_claims=True):
    """
    Loads the THF corpus from a JSON file.

    Only the first element of each entry's ``NLP.Sentences`` list is read.

    :param file_path: relative path to the JSON file
    :param group_claims: if true, the labels 'ClaimContra' and 'ClaimPro' are
        both replaced by 'Claim'
    :return: list of THFSentenceExport objects, one per entry in the file
    """
    logger.debug(u'Parsing JSON File: {}'.format(file_path))
    sentences = []
    with open(file_path, encoding='utf-8') as data_file:
        for entry in json.load(data_file):
            first_sentence = entry["NLP"]["Sentences"][0]
            # "IWNLPLemma" and "Polarity" are optional; a missing key is
            # handed to the parser helpers as None.
            tokens = [
                Token(raw["TokenIndexInSentence"],
                      raw["Text"],
                      pos_tag=raw["POSTag"],
                      iwnlp_lemma=parse_IWNLP_lemma(raw.get("IWNLPLemma", None)),
                      polarity=parse_polarity(raw.get("Polarity", None)))
                for raw in first_sentence["Tokens"]
            ]
            dependencies = [
                Dependency(raw["TokenID"],
                           raw["DependencyRelation"],
                           raw["DependencyHeadTokenID"])
                for raw in first_sentence["Dependencies"]
            ]
            label = entry["Label"]
            if group_claims and (label == 'ClaimContra' or label == 'ClaimPro'):
                label = 'Claim'
            sentences.append(
                THFSentenceExport(entry["UniqueID"], label, entry["Text"],
                                  tokens, dependencies))
    logger.info('Parsed {} sentences'.format(len(sentences)))
    return sentences
예제 #9
0
 def process_document(self, text):
     """
     Runs ``self.nlp`` on the text and converts every resulting token into a
     Token model and a Dependency model.

     :param text: input passed unchanged to ``self.nlp``
     :return: dict with the keys 'tokens' (list of Token) and 'dependencies'
         (list of Dependency); both lists follow the iteration order of the
         ``self.nlp`` result
     """
     tokens = []
     dependencies = []
     # Positions handed to the models are 1-based.
     for position, parsed in enumerate(self.nlp(text), start=1):
         tokens.append(
             Token(position,
                   text=parsed.text,
                   spacy_pos_stts=parsed.tag_,
                   spacy_pos_universal_google=parsed.pos_,
                   iwnlp_lemma=parsed._.iwnlp_lemmas,
                   spacy_ner_type=parsed.ent_type_,
                   spacy_ner_iob=parsed.ent_iob_,
                   spacy_is_punct=parsed.is_punct,
                   spacy_is_space=parsed.is_space,
                   spacy_like_num=parsed.like_num,
                   spacy_like_url=parsed.like_url,
                   spacy_shape=parsed.shape_,
                   polarity_sentiws=parsed._.sentiws))
         # The dependency links the token's own index to its head's index.
         dependencies.append(
             Dependency(parsed.i + 1, parsed.dep_, parsed.head.i + 1))
     return {'tokens': tokens, 'dependencies': dependencies}
 def test_count_different_ner_labels_example3(self):
     """count_different_ner_labels must give [0, 1, 0] when the only entity is a two-token LOC span."""
     # One row per token: (text, universal POS, NER type, NER IOB flag).
     token_rows = [
         (b'Auf', 'ADP', '', 'O'),
         (b'dem', 'DET', '', 'O'),
         (b'Tempelhofer', 'ADJ', 'LOC', 'B'),
         (b'Feld', 'NOUN', 'LOC', 'I'),
         (b'stehen', 'VERB', '', 'O'),
         (b'22', 'NUM', '', 'O'),
         (b'kleinere', 'ADJ', '', 'O'),
         (b'Geb\xc3\xa4ude', 'NOUN', '', 'O'),
         (b',', 'PUNCT', '', 'O'),
         (b'gr\xc3\xb6\xc3\x9ftenteils', 'ADV', '', 'O'),
         (b'ungenutzt', 'ADJ', '', 'O'),
         (b'.', 'PUNCT', '', 'O'),
     ]
     # Token indices are 1-based and follow the row order.
     tokens = [
         Token(position, raw, spacy_pos_universal_google=pos,
               spacy_ner_type=ner_type, spacy_ner_iob=iob)
         for position, (raw, pos, ner_type, iob) in enumerate(token_rows, start=1)
     ]
     sentence_text = "Auf dem Tempelhofer Feld stehen 22 kleinere Gebäude, größtenteils ungenutzt."
     thf_sentence = THFSentenceExport(None, None, sentence_text, tokens, None, 1)
     feature_value = ner_feature.count_different_ner_labels(thf_sentence.tokens)
     expected_value = np.array([0, 1, 0], dtype=np.float64)
     arrays_match = np.array_equal(feature_value, expected_value)
     self.assertEqual(arrays_match, True)
 def test_count_different_ner_labels_example2(self):
     """count_different_ner_labels must give [0, 1, 0] when the only entity is a single LOC token."""
     # One row per token: (text, universal POS, NER type, NER IOB flag).
     token_rows = [
         (b'Bei', 'ADP', '', 'O'),
         (b'der', 'DET', '', 'O'),
         (b'Stauraumplanung', 'NOUN', '', 'O'),
         (b'wird', 'AUX', '', 'O'),
         (b'es', 'PRON', '', 'O'),
         (b'aus', 'ADP', '', 'O'),
         (b'allen', 'DET', '', 'O'),
         (b'Gullis', 'NOUN', '', 'O'),
         (b'demn\xc3\xa4chst', 'ADV', '', 'O'),
         (b'stinken', 'VERB', '', 'O'),
         (b'und', 'CONJ', '', 'O'),
         (b'Abwaaserlagerung', 'NOUN', '', 'O'),
         (b'in', 'ADP', '', 'O'),
         (b'Schiffen', 'NOUN', '', 'O'),
         (b'auf', 'ADP', '', 'O'),
         (b'der', 'DET', '', 'O'),
         (b'Spree', 'PROPN', 'LOC', 'B'),
         (b'd\xc3\xbcrfte', 'VERB', '', 'O'),
         (b'auch', 'ADV', '', 'O'),
         (b'nicht', 'PART', '', 'O'),
         (b'gesund', 'ADJ', '', 'O'),
         (b'sein', 'AUX', '', 'O'),
         (b'.', 'PUNCT', '', 'O'),
     ]
     # Token indices are 1-based and follow the row order.
     tokens = [
         Token(position, raw, spacy_pos_universal_google=pos,
               spacy_ner_type=ner_type, spacy_ner_iob=iob)
         for position, (raw, pos, ner_type, iob) in enumerate(token_rows, start=1)
     ]
     sentence_text = "Bei der Stauraumplanung wird es aus allen Gullis demnächst stinken und Abwaaserlagerung in Schiffen auf der Spree dürfte auch nicht gesund sein."
     thf_sentence = THFSentenceExport(None, None, sentence_text, tokens, None, 1)
     feature_value = ner_feature.count_different_ner_labels(thf_sentence.tokens)
     expected_value = np.array([0, 1, 0], dtype=np.float64)
     arrays_match = np.array_equal(feature_value, expected_value)
     self.assertEqual(arrays_match, True)
 def test_count_different_ner_labels_example4(self):
     """count_different_ner_labels must give [1, 0, 0] when the only entity is a two-token PERSON span."""
     # One row per token: (text, universal POS, NER type, NER IOB flag).
     token_rows = [
         (b'Oder', 'CONJ', '', 'O'),
         (b',', 'PUNCT', '', 'O'),
         (b'um', 'SCONJ', '', 'O'),
         (b'mit', 'ADP', '', 'O'),
         (b'Hermann', 'PROPN', 'PERSON', 'B'),
         (b'Hesse', 'PROPN', 'PERSON', 'I'),
         (b'zu', 'PART', '', 'O'),
         (b'sprechen', 'VERB', '', 'O'),
         (b':', 'PUNCT', '', 'O'),
         (b'Jedem', 'DET', '', 'O'),
         (b'Ende', 'NOUN', '', 'O'),
         (b'wohnt', 'VERB', '', 'O'),
         (b'ein', 'DET', '', 'O'),
         (b'neuer', 'ADJ', '', 'O'),
         (b'Anfang', 'NOUN', '', 'O'),
         (b'inne', 'PART', '', 'O'),
         (b'.', 'PUNCT', '', 'O'),
     ]
     # Token indices are 1-based and follow the row order.
     tokens = [
         Token(position, raw, spacy_pos_universal_google=pos,
               spacy_ner_type=ner_type, spacy_ner_iob=iob)
         for position, (raw, pos, ner_type, iob) in enumerate(token_rows, start=1)
     ]
     sentence_text = "Oder, um mit Hermann Hesse zu sprechen: Jedem Ende wohnt ein neuer Anfang inne."
     thf_sentence = THFSentenceExport(None, None, sentence_text, tokens, None, 1)
     feature_value = ner_feature.count_different_ner_labels(thf_sentence.tokens)
     expected_value = np.array([1, 0, 0], dtype=np.float64)
     arrays_match = np.array_equal(feature_value, expected_value)
     self.assertEqual(arrays_match, True)
    def test_count_different_ner_labels_example1(self):
        """count_different_ner_labels must give [1, 0, 0] when the only entity is a single PERSON token."""
        # One row per token: (text, universal POS, NER type, NER IOB flag).
        token_rows = [
            (b'Wenn', 'SCONJ', '', 'O'),
            (b'ich', 'PRON', '', 'O'),
            (b'durch', 'ADP', '', 'O'),
            (b'den', 'DET', '', 'O'),
            (b'Hans', 'PROPN', '', 'O'),
            (b'-', 'PUNCT', '', 'O'),
            (b'Baluschek', 'PROPN', 'PERSON', 'B'),
            (b'-', 'PUNCT', '', 'O'),
            (b'Park', 'NOUN', '', 'O'),
            (b'radle', 'VERB', '', 'O'),
            (b',', 'PUNCT', '', 'O'),
            (b'riecht', 'VERB', '', 'O'),
            (b'es', 'PRON', '', 'O'),
            (b'immer', 'ADV', '', 'O'),
            (b'stark', 'ADJ', '', 'O'),
            (b'vom', 'ADP', '', 'O'),
            (b'angrenzenden', 'ADJ', '', 'O'),
            (b'S\xc3\xbcdgel\xc3\xa4nde', 'NOUN', '', 'O'),
            (b'nach', 'ADP', '', 'O'),
            (b'Farbd\xc3\xbcnsten', 'NOUN', '', 'O'),
            (b'und', 'CONJ', '', 'O'),
            (b'das', 'PRON', '', 'O'),
            (b'passt', 'VERB', '', 'O'),
            (b'nicht', 'PART', '', 'O'),
            (b'wirklich', 'ADJ', '', 'O'),
            (b'zum', 'ADP', '', 'O'),
            (b'Naturschutz', 'NOUN', '', 'O'),
            (b'.', 'PUNCT', '', 'O'),
        ]
        # Token indices are 1-based and follow the row order.
        tokens = [
            Token(position, raw, spacy_pos_universal_google=pos,
                  spacy_ner_type=ner_type, spacy_ner_iob=iob)
            for position, (raw, pos, ner_type, iob) in enumerate(token_rows, start=1)
        ]
        sentence_text = "Wenn ich durch den Hans-Baluschek-Park radle, riecht es immer stark vom angrenzenden Südgelände nach Farbdünsten und das passt nicht wirklich zum Naturschutz."
        thf_sentence = THFSentenceExport(None, None, sentence_text, tokens, None, 1)
        feature_value = ner_feature.count_different_ner_labels(thf_sentence.tokens)
        expected_value = np.array([1, 0, 0], dtype=np.float64)
        arrays_match = np.array_equal(feature_value, expected_value)
        self.assertEqual(arrays_match, True)
 def test_example2(self):
     """Compare transform_document output for a twenty-token sentence with a parenthesised URL.

     The expected list is the fixture value; what each position means is
     defined by structural_features_spacy, not by this test.
     """
     # One row per token: (text, spacy_is_punct, spacy_like_url).
     token_rows = [
         (b'Diesen', False, False),
         (b'Vorschlag', False, False),
         (b'gibt', False, False),
         (b'es', False, False),
         (b'schon', False, False),
         (b':', True, False),
         (b'S\xc3\xbcdlicher', False, False),
         (b'Zugang', False, False),
         (b'zur', False, False),
         (b'Oberlandstra\xc3\x9fe', False, False),
         (b'(', True, False),
         (b'Hatun', False, False),
         (b'-', True, False),
         (b'S\xc3\xbcr\xc3\xbcc\xc3\xbc', False, False),
         (b'-', True, False),
         (b'Br\xc3\xbccke', False, False),
         (b')', True, False),
         (b'(', True, False),
         (b'https://tempelhofer-feld.berlin.de/i/tempelhofer-feld/proposal/104-S%C3%BCdlicher_Zugang_zur_Oberlandstra%C3%9Fe_Hatu', False, True),
         (b')', True, False),
     ]
     # Token indices are 1-based and follow the row order.
     tokens = [
         Token(position, raw, spacy_is_punct=punct, spacy_like_url=url)
         for position, (raw, punct, url) in enumerate(token_rows, start=1)
     ]
     sentence_text = 'Diesen Vorschlag gibt es schon: S\u00fcdlicher Zugang zur Oberlandstra\u00dfe (Hatun-S\u00fcr\u00fcc\u00fc-Br\u00fccke) (https://tempelhofer-feld.berlin.de/i/tempelhofer-feld/proposal/104-S%C3%BCdlicher_Zugang_zur_Oberlandstra%C3%9Fe_Hatu)'
     thf_sentence = THFSentenceExport('p339_s003', None, sentence_text, tokens,
                                      None, 1)
     feature_value = structural_features_spacy.transform_document(
         thf_sentence, True)
     token_count = len(tokens)
     expected_value = [
         3, 0.0 / token_count, 0.0 / token_count, 20, 1, 7, 0.0, 0.0, 0.0, 1.0,
     ]
     self.assertEqual(feature_value, expected_value)
 def test_example5(self):
     """Compare transform_document output for an eleven-token sentence with a parenthesised URL.

     The expected list is the fixture value; what each position means is
     defined by structural_features_spacy, not by this test.
     """
     # One row per token: (text, spacy_is_punct, spacy_like_url).
     token_rows = [
         (b'Tempelhofparikram', False, False),
         (b'-', True, False),
         (b'Interreligi\xc3\xb6ser', False, False),
         (b'Pilgerpfad', False, False),
         (b'auf', False, False),
         (b'dem', False, False),
         (b'Tempelhofer', False, False),
         (b'Feld', False, False),
         (b'(', True, False),
         (b'http://lebensplan.com/Interreligioeser-Pilgerpfad.pdf', False, True),
         (b')', True, False),
     ]
     # Token indices are 1-based and follow the row order.
     tokens = [
         Token(position, raw, spacy_is_punct=punct, spacy_like_url=url)
         for position, (raw, punct, url) in enumerate(token_rows, start=1)
     ]
     sentence_text = 'Tempelhofparikram - Interreligi\u00f6ser Pilgerpfad auf dem Tempelhofer Feld (http://lebensplan.com/Interreligioeser-Pilgerpfad.pdf)'
     thf_sentence = THFSentenceExport('p339_s003', None, sentence_text, tokens,
                                      None, 1)
     feature_value = structural_features_spacy.transform_document(
         thf_sentence, True)
     token_count = len(tokens)
     expected_value = [
         3, 0.0 / token_count, 0.0 / token_count, 11, 1, 3, 0.0, 0.0, 0.0, 1.0,
     ]
     self.assertEqual(feature_value, expected_value)
 def test_example3(self):
     """Compare transform_document output for a fourteen-token sentence ending in a URL.

     The expected list is the fixture value; what each position means is
     defined by structural_features_spacy, not by this test.
     """
     # One row per token: (text, spacy_is_punct, spacy_like_url).
     token_rows = [
         (b'BM', False, False),
         (b'Tester', False, False),
         (b'#', True, False),
         (b'1', False, False),
         (b':', True, False),
         (b'Kite', False, False),
         (b'-', True, False),
         (b'Skaten', False, False),
         (b'auf', False, False),
         (b'dem', False, False),
         (b'Tempelhofer', False, False),
         (b'Feld', False, False),
         (b':', True, False),
         (b'http://youtu.be/Jf68D61QN4A', False, True),
     ]
     # Token indices are 1-based and follow the row order.
     tokens = [
         Token(position, raw, spacy_is_punct=punct, spacy_like_url=url)
         for position, (raw, punct, url) in enumerate(token_rows, start=1)
     ]
     sentence_text = 'BM Tester #1: Kite-Skaten auf dem Tempelhofer Feld: http://youtu.be/Jf68D61QN4A'
     thf_sentence = THFSentenceExport('p339_s003', None, sentence_text, tokens,
                                      None, 1)
     feature_value = structural_features_spacy.transform_document(
         thf_sentence, True)
     token_count = len(tokens)
     expected_value = [
         3, 0.0 / token_count, 0.0 / token_count, 14, 1, 4, 0.0, 0.0, 0.0, 1.0,
     ]
     self.assertEqual(feature_value, expected_value)