Пример #1
def test_title_word_normalisation():
    """Check normalise_title_word against a table of (input, expected) pairs.

    The table mixes words that are expected to come back unchanged
    (e.g. "word", "12345-7") with words that are expected to be shortened
    (e.g. "semantic" -> "semant", "123456-7" -> "123456").

    The table, the function under test and the label "Title word" are handed
    to test_framework.test_and_compare; presumably that helper applies the
    function to each input and reports mismatches (its implementation is not
    visible from this file section -- confirm).
    """
    tests = [
        ("word", "word"),
        ("tube", "tube"),
        ("light", "light"),
        ("sense", "sense"),
        ("domain", "domain"),
        ("jacobi", "jacobi"),
        ("chance", "chance"),
        ("12345", "12345"),
        ("12345-7", "12345-7"),
        ("123456", "123456"),
        ("123456-7", "123456"),
        ("scaling", "scaling"),
        ("strange", "strange"),
        ("semantic", "semant"),
        ("research", "researc"),
        ("citation", "citatio"),
        ("searching", "search"),
        ("retrieval", "retriev"),
        ("iostreams", "iostrea"),
        ("similarity", "similar"),
        ("co-occurrence", "co-occ"),
    ]  # closing bracket restored: without it the call below is a SyntaxError
    test_framework.test_and_compare(tests, normalise_title_word, "Title word")
Пример #2
def test_title_word_normalisation():
  """Check normalise_title_word against a table of (input, expected) pairs.

  The table mixes words that are expected to come back unchanged
  (e.g. "word", "12345-7") with words that are expected to be shortened
  (e.g. "semantic" -> "semant", "123456-7" -> "123456").

  The table, the function under test and the label "Title word" are handed
  to test_framework.test_and_compare; presumably that helper applies the
  function to each input and reports mismatches (its implementation is not
  visible from this file section -- confirm).
  """
  tests = [
    ("word", "word"),
    ("tube", "tube"),
    ("light", "light"),
    ("sense", "sense"),
    ("domain", "domain"),
    ("jacobi", "jacobi"),
    ("chance", "chance"),
    ("12345", "12345"),
    ("12345-7", "12345-7"),
    ("123456", "123456"),
    ("123456-7", "123456"),
    ("scaling", "scaling"),
    ("strange", "strange"),
    ("semantic", "semant"),
    ("research", "researc"),
    ("citation", "citatio"),
    ("searching", "search"),
    ("retrieval", "retriev"),
    ("iostreams", "iostrea"),
    ("similarity", "similar"),
    ("co-occurrence", "co-occ"),
  ]  # closing bracket restored: without it the call below is a SyntaxError
  test_framework.test_and_compare(tests, normalise_title_word, "Title word")
Пример #3
def test_normalise_first_N_words_of_title():
  """Exercise normalise_first_N_words_of_title with N fixed at 3.

  A table of paper titles is passed to test_framework.test_and_compare
  together with a one-argument adapter around the function under test and
  the label "First 3 title words".
  """
  def normalise_first_3_title_words(title):
    # Adapter: pins N to 3 so the function under test takes a single argument.
    N = 3
    return normalise_first_N_words_of_title(title, N)

  # NOTE(review): every entry below stops after the title string -- the
  # expected normalised value and the closing parenthesis are missing, and so
  # is the closing bracket of the list. As shown this does not parse. The
  # expected values cannot be derived from what is visible here; restore them
  # from the original file.
  tests = [
    ("Experiments in word domain disambiguation for parallel texts",
    ("The role of domain information in word sense disambiguation",
    ("Automatic retrieval and clustering of similar words",
    ("A general framework for distributional similarity",
    ("A maximum entropy part-of-speech tagger",
    ("Domain-specific sense distributions and predominant sense acquisition",
    ("Sussx: WSD using automatically acquired predominant senses",
    ("Scaling context space",
    ("Co-occurrence Retrieval: A Flexible Framework for Lexical Distributional Similarity",
    ("Discovering corpus-specific word senses",
    ("Using domain information for word sense disambiguation",
    ("Web-scale distributional similarity and entity set expansion",
    ("From predicting predominant senses to local context for word sense disambiguation",
    ("Large-Scale Syntactic Processing: Parsing the Web Final Report of the 2009 JHU CLSP Workshop",
    ("Robust, applied morphological generation",
    ("From distributional to semantic similarity",
    ("TnT: a statistical part-of-speech tagger",
    ("A best-first probabilistic shift-reduce parser",
    ("Using automatically acquired predominant senses for word sense disambiguation",
    ("Evaluating WordNet-based Measures of Lexical Semantic Relatedness",
  test_framework.test_and_compare(tests, normalise_first_3_title_words, "First 3 title words")
Пример #4
def test_author_name_normalisation():
    """Check normalise_lastname against a table of (input, expected) pairs.

    The expected values in the table are all lower-case; punctuation such as
    "." and "'" is absent from them ("Mr.E" -> "mre", "O'Keefe" -> "okeefe"),
    longer names appear shortened ("Koprinska" -> "koprins") and the
    non-ASCII "u-umlaut" is expected to become a plain "u".

    The table, the function under test and the label "Author name" are handed
    to test_framework.test_and_compare; presumably that helper applies the
    function to each input and reports mismatches (its implementation is not
    visible from this file section -- confirm).
    """
    tests = [
        ("Kay", "kay"),
        ("Mr.E", "mre"),
        ("Wang", "wang"),
        ("Curran", "curran"),
        ("Fekete", "fekete"),
        ("Manning", "manning"),
        ("Nothman", "nothman"),
        ("O'Keefe", "okeefe"),
        ("Ringland", "ringlan"),
        ("Koprinska", "koprins"),
        ("Kummerfeld", "kummerf"),
        ("Balasuriya", "balasur"),
        (u"M\u00FCller", u'muller'),  # Contains a non-ASCII character
        ("Durrant-White", "durrant"),
    ]  # closing bracket restored: without it the call below is a SyntaxError
    test_framework.test_and_compare(tests, normalise_lastname, "Author name")
Пример #5
def test_author_name_normalisation():
  """Check normalise_lastname against a table of (input, expected) pairs.

  The expected values in the table are all lower-case; punctuation such as
  "." and "'" is absent from them ("Mr.E" -> "mre", "O'Keefe" -> "okeefe"),
  longer names appear shortened ("Koprinska" -> "koprins") and the
  non-ASCII "u-umlaut" is expected to become a plain "u".

  The table, the function under test and the label "Author name" are handed
  to test_framework.test_and_compare; presumably that helper applies the
  function to each input and reports mismatches (its implementation is not
  visible from this file section -- confirm).
  """
  tests = [
    ("Kay", "kay"),
    ("Mr.E", "mre"),
    ("Wang", "wang"),
    ("Curran", "curran"),
    ("Fekete", "fekete"),
    ("Manning", "manning"),
    ("Nothman", "nothman"),
    ("O'Keefe", "okeefe"),
    ("Ringland", "ringlan"),
    ("Koprinska", "koprins"),
    ("Kummerfeld", "kummerf"),
    ("Balasuriya", "balasur"),
    (u"M\u00FCller", u'muller'),  # Contains a non-ASCII character
    ("Durrant-White", "durrant"),
  ]  # closing bracket restored: without it the call below is a SyntaxError
  test_framework.test_and_compare(tests, normalise_lastname, "Author name")
Пример #6
def test_normalise_first_N_words_of_title():
    """Exercise normalise_first_N_words_of_title with N fixed at 3.

    A table of paper titles is passed to test_framework.test_and_compare
    together with a one-argument adapter around the function under test and
    the label "First 3 title words".
    """
    def normalise_first_3_title_words(title):
        # Adapter: pins N to 3 so the function under test takes a single
        # argument.
        N = 3
        return normalise_first_N_words_of_title(title, N)

    # NOTE(review): only the entries that fit on one line are complete
    # (title, expected) pairs. The others stop after the title string -- their
    # expected value and closing parenthesis are missing -- and the closing
    # bracket of the list is missing too. As shown this does not parse. The
    # missing expected values cannot be derived from what is visible here;
    # restore them from the original file.
    tests = [
        ("Experiments in word domain disambiguation for parallel texts",
        ("The role of domain information in word sense disambiguation",
        ("Automatic retrieval and clustering of similar words",
        ("A general framework for distributional similarity",
        ("A maximum entropy part-of-speech tagger", "maximum-entropy-part-of"),
        ("Domain-specific sense distributions and predominant sense acquisition",
        ("Sussx: WSD using automatically acquired predominant senses",
        ("Scaling context space", "scaling-context-space"),
        ("Co-occurrence Retrieval: A Flexible Framework for Lexical Distributional Similarity",
        ("Discovering corpus-specific word senses", "discov-corpus-word"),
        ("Using domain information for word sense disambiguation",
        ("Web-scale distributional similarity and entity set expansion",
        ("From predicting predominant senses to local context for word sense disambiguation",
        ("Large-Scale Syntactic Processing: Parsing the Web Final Report of the 2009 JHU CLSP Workshop",
        ("Robust, applied morphological generation", "robust-applied-morphol"),
        ("From distributional to semantic similarity",
        ("TnT: a statistical part-of-speech tagger", "tnt-statist-part-of"),
        ("A best-first probabilistic shift-reduce parser",
        ("Using automatically acquired predominant senses for word sense disambiguation",
        ("Evaluating WordNet-based Measures of Lexical Semantic Relatedness",
    test_framework.test_and_compare(tests, normalise_first_3_title_words,
                                    "First 3 title words")