Python Corpus示例，structures.Corpus Python示例

示例#1

0

显示文件

文件： stupid_splitter.py 项目： tchewik/rusclasp

def main():
    """Run the naive ("stupid") span splitter over a corpus and time it.

    Iterates over the ``json`` texts of the corpus rooted at the hard-coded
    path, splits each text into sentences, splits each sentence into spans
    with ``stupid_span_splitter`` and computes boundaries for every span.
    Each text is then written out via ``write_stupid_clause_ann`` and copied
    into the brat data directory (presumably as brat annotations -- confirm
    against ``structures``).

    Prints the total number of spans produced, then the elapsed wall-clock
    time in seconds.
    """
    t1 = time.time()

    new_corpus = Corpus(u'/home/gree-gorey/stupid/')

    # Running total of spans across every sentence of every text.
    spans = 0

    for text in new_corpus.texts(u'json'):
        text.sentence_splitter()
        for sentence in text.sentences:

            sentence.stupid_span_splitter()

            spans += len(sentence.spans)

            for span in sentence.spans:
                span.get_boundaries()

        text.write_stupid_clause_ann()

        text.copy_into_brat(u'/opt/brat-v1.3_Crunchy_Frog/data/stupid/')

    # Single-argument print(...) behaves identically on Python 2 and 3,
    # whereas the bare print statement is a SyntaxError on Python 3.
    print(spans)

    t2 = time.time()

    print(t2 - t1)

示例#2

0

显示文件

文件： dummy_ann.py 项目： nasedkinav/rusclasp

def main():
    """Write dummy annotations for every ``txt`` text and report the runtime.

    For each text of the corpus at the hard-coded path, calls
    ``write_dummy_ann`` and then ``copy_into_brat`` with the brat ``left``
    data directory and ``True`` as the second argument (its meaning is
    defined by ``copy_into_brat`` -- not visible here). Finally prints the
    elapsed wall-clock time in seconds.
    """
    started = time.time()

    corpus = Corpus('/home/gree-gorey/CorpusTest/')
    brat_target = '/opt/brat-v1.3_Crunchy_Frog/data/left/'

    for document in corpus.texts('txt'):
        document.write_dummy_ann()
        document.copy_into_brat(brat_target, True)

    print(time.time() - started)

示例#3

0

显示文件

文件： dummy_ann.py 项目： gree-gorey/rusclasp

def main():
    """Write dummy annotations for every ``txt`` text and report the runtime.

    For each text of the corpus at the hard-coded path, calls
    ``write_dummy_ann`` and then ``copy_into_brat`` with the brat ``left``
    data directory and ``True`` as the second argument (its meaning is
    defined by ``copy_into_brat`` -- not visible here). Finally prints the
    elapsed wall-clock time in seconds.
    """
    t1 = time.time()

    new_corpus = Corpus(u'/home/gree-gorey/CorpusTest/')

    for text in new_corpus.texts(u'txt'):

        text.write_dummy_ann()

        text.copy_into_brat(u'/opt/brat-v1.3_Crunchy_Frog/data/left/', True)

    t2 = time.time()

    # Single-argument print(...) behaves identically on Python 2 and 3,
    # whereas the bare print statement is a SyntaxError on Python 3.
    print(t2 - t1)

示例#4

0

显示文件

文件： main.py 项目： keni-m-patel/scitext-explorer

import logging
from structures import Corpus, Merge
from algorithms import Algorithm
from visualization import Visualization
import utilities

# Send all log records (DEBUG and above) to scitext.log; filemode='w'
# truncates the log file on every run.
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s %(name)s %(levelname)s %(message)s',
                    filename='scitext.log',
                    filemode='w')
'''
TODO:
# ADD MERGE class and functionality with master config and then allow linek to all config classes

'''
# The master config lists the per-corpus configuration files under the
# 'config_files' key.
config = utilities.get_config('./config/data/master.yaml')
print('\n\nreading from the following configuration files: \n\n ',
      config['config_files'])

# Build one Corpus per listed config file, then call each Corpus instance
# and hand the results to Merge.
corpi_list = [Corpus(config_file) for config_file in config['config_files']]
corpi = Merge([corpus() for corpus in corpi_list])

# Run the algorithms configured in algorithms.yaml over the merged corpora.
alg = Algorithm(corpi, './config/algorithms.yaml')
alg.run()

# The Visualization receives the Algorithm object itself (not the return
# value of alg.run()).
vis = Visualization('./config/visualization.yaml', './config/algorithms.yaml',
                    alg)
vis.run()

示例#5

0

显示文件

文件： split.py 项目： gree-gorey/rusclasp

def main():
    """Run the clause-splitting pipeline over a corpus and time it.

    For every ``json`` text of the corpus at the hard-coded path the text is
    split into sentences, and each sentence goes through a fixed sequence of
    steps: marking complementizers and names, eliminating paired commas,
    splitting into spans, typing each span, splitting/restoring embedded and
    base spans, and finally computing span boundaries. The order of these
    calls is kept exactly as written; the steps appear to depend on state
    left on the sentence by earlier ones (not verifiable from here).

    Each text is then written out with ``write_clause_ann`` and copied into
    the brat ``right`` data directory. Prints the elapsed wall-clock time in
    seconds.
    """
    t1 = time.time()

    new_corpus = Corpus(u'/home/gree-gorey/CorpusTemp/')
    # new_corpus = Corpus(u'/home/gree-gorey/tested/')
    # new_corpus = Corpus(u'/home/gree-gorey/tested_tested/')

    for text in new_corpus.texts(u'json'):
        text.sentence_splitter()
        # print len(text.sentences)
        for sentence in text.sentences:
            # print sentence.tokens[0].content

            # for token in sentence.tokens:
            #     print token.pos, token.content, token.lex
            # print u'***************'

            # sentence.find_pp()

            # sentence.find_coordination()

            sentence.find_complimentizers()

            sentence.find_names()

            sentence.eliminate_pair_comma()

            # for token in sentence.tokens:
            #     print token.pos, token.content
            # print u'***************'
            # print

            sentence.span_splitter()

            sentence.get_shared_tokens()  # loop through all the spans 1

            sentence.split_double_complimentizers()  # loop through all the spans 2

            for span in sentence.spans:  # loop through all the spans 3

                # decide whether span is inserted or embedded or neither
                span.type()
                # print span.tokens[0].content, span.embedded_type

            # split embedded span if it contains > 1 predicate
            sentence.split_embedded()

            # for span in sentence.spans:
            #     print span.shared_tokens[0].content, span.tokens[0].content

            # walk through spans and join whenever possible
            sentence.restore_embedded()

            sentence.split_base()

            # for span in sentence.spans:
            #     print span.shared_tokens[0].content, span.tokens[0].content, span.finite()

            sentence.restore_base()

            # for span in sentence.spans:
            #     print span.shared_tokens[0].content, span.tokens[0].content, span.finite()

            for span in sentence.spans:
                span.get_boundaries()
                # print span.quasi_embedded, span.tokens[0].content

        text.write_clause_ann()

        text.copy_into_brat(u'/opt/brat-v1.3_Crunchy_Frog/data/right/')

    t2 = time.time()

    # Single-argument print(...) behaves identically on Python 2 and 3,
    # whereas the bare print statement is a SyntaxError on Python 3.
    print(t2 - t1)

示例#6

0

显示文件

def main():
    """Run the clause-splitting pipeline over a corpus and time it.

    For every ``json`` text of the corpus at the hard-coded path the text is
    split into sentences, and each sentence goes through a fixed sequence of
    steps: marking complementizers and names, eliminating paired commas,
    splitting into spans, typing each span, splitting/restoring embedded and
    base spans, and finally computing span boundaries. The order of these
    calls is kept exactly as written; the steps appear to depend on state
    left on the sentence by earlier ones (not verifiable from here).

    Each text is then written out with ``write_clause_ann`` and copied into
    the brat ``right`` data directory. Prints the elapsed wall-clock time in
    seconds.
    """
    t1 = time.time()

    new_corpus = Corpus(u'/home/gree-gorey/CorpusTemp/')
    # new_corpus = Corpus(u'/home/gree-gorey/tested/')
    # new_corpus = Corpus(u'/home/gree-gorey/tested_tested/')

    for text in new_corpus.texts(u'json'):
        text.sentence_splitter()
        # print len(text.sentences)
        for sentence in text.sentences:
            # print sentence.tokens[0].content

            # for token in sentence.tokens:
            #     print token.pos, token.content, token.lex
            # print u'***************'

            # sentence.find_pp()

            # sentence.find_coordination()

            sentence.find_complimentizers()

            sentence.find_names()

            sentence.eliminate_pair_comma()

            # for token in sentence.tokens:
            #     print token.pos, token.content
            # print u'***************'
            # print

            sentence.span_splitter()

            # loop through all the spans 1
            sentence.get_shared_tokens()

            # loop through all the spans 2
            sentence.split_double_complimentizers()

            for span in sentence.spans:  # loop through all the spans 3

                # decide whether span is inserted or embedded or neither
                span.type()
                # print span.tokens[0].content, span.embedded_type

            # split embedded span if it contains > 1 predicate
            sentence.split_embedded()

            # for span in sentence.spans:
            #     print span.shared_tokens[0].content, span.tokens[0].content

            # walk through spans and join whenever possible
            sentence.restore_embedded()

            sentence.split_base()

            # for span in sentence.spans:
            #     print span.shared_tokens[0].content, span.tokens[0].content, span.finite()

            sentence.restore_base()

            # for span in sentence.spans:
            #     print span.shared_tokens[0].content, span.tokens[0].content, span.finite()

            for span in sentence.spans:
                span.get_boundaries()
                # print span.quasi_embedded, span.tokens[0].content

        text.write_clause_ann()

        text.copy_into_brat(u'/opt/brat-v1.3_Crunchy_Frog/data/right/')

    t2 = time.time()

    # Single-argument print(...) behaves identically on Python 2 and 3,
    # whereas the bare print statement is a SyntaxError on Python 3.
    print(t2 - t1)

示例#7

0

显示文件

文件： main.py 项目： Akhonbay/scitext-explorer

# `logging`, `Corpus` and `Merge` are used below, so they must be imported
# here; without these two imports the script fails with NameError.
import logging
from structures import Corpus, Merge
from algorithms import Algorithm
from visualization import Visualization
import utilities

# Send all log records (DEBUG and above) to scitext.log; filemode='w'
# truncates the log file on every run.
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s %(name)s %(levelname)s %(message)s',
                    filename='scitext.log',
                    filemode='w')
'''
TODO:
# ADD MERGE class and functionality with master config and then allow linek to all config classes

'''
# Read the master config once and reuse it, instead of parsing the same
# YAML file separately for the banner and for the corpus list.
config = utilities.get_config('./config/data/master.yaml')
print('\n\nreading from the following configuration files: \n\n ',
      config['config_files'])

# Build one Corpus per listed config file, then call each Corpus instance
# and hand the results to Merge.
corpus_list = [Corpus(config_file) for config_file in config['config_files']]
corpi = Merge([corpus() for corpus in corpus_list])

alg = Algorithm(corpi, './config/algorithms.yaml')

# The Visualization is given the return value of alg.run(), not the
# Algorithm object itself.
alg_ran = alg.run()

vis = Visualization('./config/visualization.yaml', './config/algorithms.yaml',
                    alg_ran)

vis.run()