예제 #1
0
 def length(batch: DataPanel, columns: list):
     """Return one length per example in ``batch``.

     The lengths are taken from whatever ``lookup`` returns for ``SpacyOp``
     on ``columns``. If that path raises ``AttributeError``, the texts in
     the first listed column are split on whitespace and counted instead.
     """
     try:
         # Reuse the Spacy output that was stored earlier, if it is there
         spacy_docs = lookup(batch, SpacyOp, columns)
         return list(map(len, spacy_docs))
     except AttributeError:
         # Nothing usable was stored: count whitespace-separated pieces
         counts = []
         for text in batch[columns[0]]:
             counts.append(len(text.split()))
         return counts
예제 #2
0
    def fn():
        nonlocal dp
        from robustnessgym import lookup

        dp = op(dp=dp, columns=["question"])

        # Look it up when you need it
        capitalized_text = lookup(dp, op, ["question"])
        return capitalized_text
예제 #3
0
def stanza_example(dp):
    """Render a walkthrough of running ``StanzaOp`` on the "question" column.

    Applies the op to ``dp``, looks up the resulting column, and passes four
    code snippets to ``format_code`` together with the values they evaluate
    to. Output goes through ``st`` (presumably Streamlit) and ``format_code``;
    nothing is returned.
    """
    # Forwarded unchanged to every format_code call.
    columns = [0.45, 0.05, 0.50]
    st.header("Run Stanza Workflow")
    from robustnessgym import lookup
    from robustnessgym.ops import StanzaOp

    # Code shown to the reader; mirrors the statements executed below.
    workflow_snippet = (
        "\n"
        "from robustnessgym import lookup\n"
        "from robustnessgym.ops import StanzaOp\n"
        "\n"
        "# Run the Stanza pipeline on the 'question' column of the dataset\n"
        "stanza = StanzaOp()\n"
        "dp = stanza(dp=dp, columns=['question'])\n"
        "# adds a new column that is auto-named \"StanzaOp(columns=['question'])\"\n"
        "\n"
        "# Grab the Stanza column from the DataPanel using the lookup\n"
        "stanza_column = lookup(dp, stanza, ['question'])\n"
        "        "
    )
    cell_snippet = " \ncell = stanza_column[0]\ncell\n        "
    text_snippet = " \ncell = stanza_column[0]\ncell.text\n        "
    entities_snippet = " \ncell = stanza_column[0]\ncell.entities\n        "

    # Run the Stanza pipeline on the 'question' column of the dataset;
    # this adds a new column auto-named "StanzaOp(columns=['question'])"
    stanza = StanzaOp()
    dp = stanza(dp=dp, columns=["question"])

    # Pull the Stanza column back out of the DataPanel via lookup
    stanza_column = lookup(dp, stanza, ["question"])
    format_code(workflow_snippet, stanza_column._repr_pandas_(), columns=columns)

    st.subheader("Columns contain Cells")
    format_code(cell_snippet, stanza_column[0], columns=columns)

    st.subheader("Cells can be treated like stanza objects")
    format_code(text_snippet, stanza_column[0].text, columns=columns)
    format_code(entities_snippet, stanza_column[0].entities, columns=columns)
예제 #4
0
    def test_apply(self):
        """Apply ``SpacyOp`` to the testbed's "text" column and check lookups.

        Compares sentences, tokens, entities and token counts read from the
        looked-up docs against fixed expectations for the six testbed rows.
        """
        # Build the cached Spacy operation and run it over the dataset
        spacy = SpacyOp()
        dataset = spacy(self.testbed.dataset, ["text"])
        print(dataset.column_names)

        def docs():
            # A fresh lookup for every property that gets extracted
            return lookup(dataset, spacy, ["text"])

        # Pull out the information under test
        sentences = [doc.sents for doc in docs()]
        tokens = [list(doc) for doc in docs()]
        entities = [doc.ents for doc in docs()]
        num_tokens = [len(list(doc)) for doc in docs()]

        expected_sentences = [
            ["The man is walking."],
            ["The man is running."],
            ["The woman is sprinting."],
            ["The woman is resting."],
            ["The hobbit is flying."],
            ["The hobbit is swimming."],
        ]
        expected_tokens = [
            ["The", "man", "is", "walking", "."],
            ["The", "man", "is", "running", "."],
            ["The", "woman", "is", "sprinting", "."],
            ["The", "woman", "is", "resting", "."],
            ["The", "hobbit", "is", "flying", "."],
            ["The", "hobbit", "is", "swimming", "."],
        ]

        self.assertEqual(sentences, expected_sentences)
        self.assertEqual(tokens, expected_tokens)
        self.assertEqual(entities, [[] for _ in range(6)])
        self.assertEqual(num_tokens, [5] * 6)
예제 #5
0
    def fn():
        nonlocal dp
        from robustnessgym import lookup
        from robustnessgym.ops import LazyTextBlobOp

        # Run the TextBlob pipeline on the 'passage' column of the dataset
        textblob = LazyTextBlobOp()
        dp = textblob(dp=dp, columns=["question"])
        # adds a new column that is auto-named "LazyTextBlobOp(columns=['question'])"

        # Grab the TextBlob column from the DataPanel using the lookup
        textblob_column = lookup(dp, textblob, ["question"])
        return textblob_column
예제 #6
0
    def fn():
        nonlocal dp, op
        from robustnessgym import lookup

        upper_text = lookup(dp, op, ["question"], "upper")
        return upper_text
예제 #7
0
    def fn():
        nonlocal dp, op
        from robustnessgym import lookup

        capitalized_text = lookup(dp, op, ["question"], "capitalize")
        return capitalized_text
예제 #8
0
def spacy_example(dp):
    """Render a walkthrough of running ``SpacyOp`` on the "passage" column.

    Applies the op to ``dp``, then uses ``format_code`` to show each code
    snippet beside the value it evaluates to: ``dp.streamlit()``, the
    looked-up column, the cell at index 1, ``list(cell)`` and ``cell.ents``.
    Output goes through ``st`` and ``format_code``; nothing is returned.
    """
    # Forwarded unchanged to every format_code call.
    columns = [0.45, 0.05, 0.50]

    # NOTE(review): newer Streamlit releases expose this as `st.container`;
    # confirm against the pinned Streamlit version before renaming.
    with st.beta_container():
        st.header("Run Operation: `SpacyOp`")
        # Written as explicit literals so the trailing spaces at the line
        # ends of the original text are kept.
        st.write(
            "\n"
            "spaCy is a popular text processing library that provides "
            "tokenization, tagging \n"
            "and other capabilities. \n"
            "            "
        )

        from robustnessgym import lookup
        from robustnessgym.ops import SpacyOp

        # Run the Spacy pipeline on the 'passage' column of the dataset
        spacy = SpacyOp()
        dp = spacy(dp=dp, columns=["passage"])
        # adds a new column that is auto-named
        # "SpacyOp(lang=en_core_web_sm, neuralcoref=False, columns=['passage'])"

        # The displayed snippet mirrors the statements above, so it names
        # the 'passage' column as well.
        format_code(
            """
from robustnessgym import lookup
from robustnessgym.ops import SpacyOp

# Run the Spacy pipeline on the 'passage' column of the dataset
spacy = SpacyOp()
dp = spacy(dp=dp, columns=['passage'])
# adds a new column that is auto-named
# "SpacyOp(lang=en_core_web_sm, neuralcoref=False, columns=['passage'])"
            """,
            dp.streamlit(),
            columns=columns,
        )

    st.write("------")

    with st.beta_container():
        # Grab the Spacy column from the DataPanel using the lookup
        spacy_column = lookup(dp, spacy, ["passage"])
        format_code(
            """
lookup(dp, spacy, ['passage'])
            """,
            spacy_column._repr_pandas_(),
            columns=columns,
        )

    with st.beta_container():
        st.subheader("Columns contain Cells")
        # The same cell is reused for the three snippets below.
        cell = spacy_column[1]
        format_code(
            """
cell = spacy_column[1]
cell
            """,
            cell,
            columns=columns,
        )

        format_code(
            """
list(cell)
            """,
            list(cell),
            columns=columns,
        )
        format_code(
            """
cell.ents
            """,
            cell.ents,
            columns=columns,
        )