def main():
    """Run the baseline ("stupid") span splitter over a corpus and time it.

    Iterates over the ``json`` texts of the corpus rooted at
    ``/home/gree-gorey/stupid/``. Each text is split into sentences, each
    sentence is split into spans with ``stupid_span_splitter()``, and
    ``get_boundaries()`` is called on every span. Afterwards
    ``write_stupid_clause_ann()`` and ``copy_into_brat()`` are called on the
    text.

    Prints the total number of spans seen, then the elapsed time in seconds.

    NOTE(review): the block nesting was reconstructed from a
    whitespace-flattened source; confirm that the two prints belong after
    the corpus loop rather than inside it.
    """
    t1 = time.time()
    new_corpus = Corpus(u'/home/gree-gorey/stupid/')
    spans = 0  # running total of spans across all texts

    for text in new_corpus.texts(u'json'):
        text.sentence_splitter()
        for sentence in text.sentences:
            sentence.stupid_span_splitter()
            spans += len(sentence.spans)
            for span in sentence.spans:
                span.get_boundaries()
        text.write_stupid_clause_ann()
        text.copy_into_brat(u'/opt/brat-v1.3_Crunchy_Frog/data/stupid/')

    # Single-argument print(...) is valid on Python 3 and prints the same
    # text on Python 2, unlike the original print statement.
    print(spans)
    t2 = time.time()
    print(t2 - t1)
def main():
    """Write dummy annotations for every ``txt`` text and time the run.

    For each text of the corpus rooted at ``/home/gree-gorey/CorpusTest/``
    this calls ``write_dummy_ann()`` and then ``copy_into_brat()`` with the
    ``left`` brat data directory and ``True`` as the second argument.
    Prints the elapsed time in seconds when done.
    """
    started = time.time()

    corpus = Corpus('/home/gree-gorey/CorpusTest/')
    for document in corpus.texts('txt'):
        document.write_dummy_ann()
        document.copy_into_brat(
            '/opt/brat-v1.3_Crunchy_Frog/data/left/',
            True,
        )

    finished = time.time()
    print(finished - started)
def main():
    """Write dummy annotations for every ``txt`` text and time the run.

    For each text of the corpus rooted at ``/home/gree-gorey/CorpusTest/``
    this calls ``write_dummy_ann()`` and then ``copy_into_brat()`` with the
    ``left`` brat data directory and ``True`` as the second argument.
    Prints the elapsed time in seconds when done.
    """
    t1 = time.time()
    new_corpus = Corpus(u'/home/gree-gorey/CorpusTest/')

    for text in new_corpus.texts(u'txt'):
        text.write_dummy_ann()
        text.copy_into_brat(u'/opt/brat-v1.3_Crunchy_Frog/data/left/', True)

    t2 = time.time()
    # Single-argument print(...) is valid on Python 3 and prints the same
    # text on Python 2, unlike the original print statement.
    print(t2 - t1)
import logging

from structures import Corpus, Merge
from algorithms import Algorithm
from visualization import Visualization
import utilities

# Send DEBUG-and-above records to scitext.log; filemode 'w' truncates the
# file each time this module runs.
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s %(name)s %(levelname)s %(message)s',
    filename='scitext.log',
    filemode='w',
)

''' TODO: # ADD MERGE class and functionality with master config and then allow linek to all config classes '''

# The master config lists the per-corpus config files under 'config_files'.
config = utilities.get_config('./config/data/master.yaml')
print(
    '\n\nreading from the following configuration files: \n\n ',
    config['config_files'],
)

# Build one Corpus per listed config file, call each of them, and merge the
# results into a single object for the algorithm stage.
corpi_list = list(map(Corpus, config['config_files']))
corpi = Merge([corpus() for corpus in corpi_list])

alg = Algorithm(corpi, './config/algorithms.yaml')
alg.run()

# The visualization receives the Algorithm instance itself (after run()).
vis = Visualization(
    './config/visualization.yaml',
    './config/algorithms.yaml',
    alg,
)
vis.run()
def _split_sentence_into_clauses(sentence):
    """Run the clause-splitting steps on one sentence, in their original order.

    The steps mutate ``sentence`` (and its spans) in place; nothing is
    returned. ``find_pp()`` and ``find_coordination()`` were commented out
    in the original pipeline and stay disabled.
    """
    sentence.find_complimentizers()
    sentence.find_names()
    sentence.eliminate_pair_comma()

    # NOTE(review): in the whitespace-flattened source this call directly
    # follows a "# print" marker, so it could have been commented out. It is
    # treated as live because the steps below iterate over sentence.spans --
    # confirm against the original file.
    sentence.span_splitter()

    sentence.get_shared_tokens()  # loop through all the spans 1
    sentence.split_double_complimentizers()  # loop through all the spans 2
    for span in sentence.spans:  # loop through all the spans 3
        # decide whether span is inserted or embedded or neither
        span.type()

    # split embedded span if it contains > 1 predicate
    sentence.split_embedded()
    # walk through spans and join whenever possible
    sentence.restore_embedded()

    sentence.split_base()
    sentence.restore_base()

    for span in sentence.spans:
        span.get_boundaries()


def main():
    """Run the clause-splitting pipeline over a corpus and time it.

    Iterates over the ``json`` texts of the corpus rooted at
    ``/home/gree-gorey/CorpusTemp/`` (the original also listed ``tested/``
    and ``tested_tested/`` as commented-out alternatives). Each text is
    split into sentences, every sentence goes through the clause-splitting
    steps, and then ``write_clause_ann()`` and ``copy_into_brat()`` are
    called on the text. Prints the elapsed time in seconds when done.
    """
    t1 = time.time()
    new_corpus = Corpus(u'/home/gree-gorey/CorpusTemp/')

    for text in new_corpus.texts(u'json'):
        text.sentence_splitter()
        for sentence in text.sentences:
            _split_sentence_into_clauses(sentence)
        text.write_clause_ann()
        text.copy_into_brat(u'/opt/brat-v1.3_Crunchy_Frog/data/right/')

    t2 = time.time()
    # Single-argument print(...) is valid on Python 3 and prints the same
    # text on Python 2, unlike the original print statement.
    print(t2 - t1)
def main():
    """Run the clause-splitting pipeline over a corpus and time it.

    Iterates over the ``json`` texts of the corpus rooted at
    ``/home/gree-gorey/CorpusTemp/`` (the original also listed ``tested/``
    and ``tested_tested/`` as commented-out alternatives). Each text is
    split into sentences, every sentence goes through the steps below in
    order, and then ``write_clause_ann()`` and ``copy_into_brat()`` are
    called on the text. Prints the elapsed time in seconds when done.
    """
    t1 = time.time()
    new_corpus = Corpus(u'/home/gree-gorey/CorpusTemp/')

    for text in new_corpus.texts(u'json'):
        text.sentence_splitter()

        for sentence in text.sentences:
            # find_pp() and find_coordination() were commented out in the
            # original pipeline and stay disabled.
            sentence.find_complimentizers()
            sentence.find_names()
            sentence.eliminate_pair_comma()

            # NOTE(review): in the whitespace-flattened source this call
            # directly follows a "# print" marker, so it could have been
            # commented out. It is treated as live because the steps below
            # iterate over sentence.spans -- confirm against the original.
            sentence.span_splitter()

            sentence.get_shared_tokens()  # loop through all the spans 1
            sentence.split_double_complimentizers()  # loop through all the spans 2
            for span in sentence.spans:  # loop through all the spans 3
                # decide whether span is inserted or embedded or neither
                span.type()

            # split embedded span if it contains > 1 predicate
            sentence.split_embedded()
            # walk through spans and join whenever possible
            sentence.restore_embedded()

            sentence.split_base()
            sentence.restore_base()

            for span in sentence.spans:
                span.get_boundaries()

        text.write_clause_ann()
        text.copy_into_brat(u'/opt/brat-v1.3_Crunchy_Frog/data/right/')

    t2 = time.time()
    # Single-argument print(...) is valid on Python 3 and prints the same
    # text on Python 2, unlike the original print statement.
    print(t2 - t1)
import logging

# logging, Corpus and Merge are used below but were not imported in this
# snippet; import them explicitly so the script does not depend on names
# defined elsewhere.
from structures import Corpus, Merge
from algorithms import Algorithm
from visualization import Visualization
import utilities

# Send DEBUG-and-above records to scitext.log; filemode 'w' truncates the
# file each time this module runs.
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s %(name)s %(levelname)s %(message)s',
                    filename='scitext.log',
                    filemode='w')

# TODO: add MERGE class and functionality with master config, and then allow
# links to all config classes.

# Read the master config once so the printed list and the list used to build
# the corpora cannot disagree (it was previously parsed twice).
config = utilities.get_config('./config/data/master.yaml')
print('\n\nreading from the following configuration files: \n\n ',
      config['config_files'])

# Build one Corpus per listed config file, call each of them, and merge the
# results into a single object for the algorithm stage.
corpus_list = [Corpus(config_file) for config_file in config['config_files']]
corpi = Merge([corpus() for corpus in corpus_list])

alg = Algorithm(corpi, './config/algorithms.yaml')
# Unlike a bare alg.run(), the return value of run() is what gets handed to
# the visualization.
alg_ran = alg.run()

vis = Visualization('./config/visualization.yaml',
                    './config/algorithms.yaml', alg_ran)
vis.run()