Exemplo n.º 1
0
    def __call__(self, doc: Doc) -> Doc:
        """Adjust sentence boundaries in *doc* using the configured matchers.

        Split matches force a sentence start at the match's last token;
        join matches erase any sentence starts inside the matched span;
        finally, each sentence start is shifted past leading whitespace
        tokens so no sentence begins with spaces.

        NOTE(review): relies on the spaCy 2.x ``Doc.is_parsed`` /
        ``Doc.is_sentenced`` flags — confirm the installed spaCy version
        still exposes them (removed in spaCy 3).

        Parameters
        ----------
        doc : Doc
            The document whose ``is_sent_start`` flags are rewritten in place.

        Returns
        -------
        Doc
            The same document, mutated.
        """
        save_parsed = doc.is_parsed
        # is_sent_start may only be written while the doc is not marked parsed.
        doc.is_parsed = False
        if self.split_matcher:
            matches = self.split_matcher(doc)
            for match_id, start, end in matches:
                # The last token of the match begins a new sentence.
                token = doc[end - 1]
                token.is_sent_start = True
                # Drop a boundary set on the token just before the new one,
                # which would otherwise leave a one-token sentence behind.
                if end - 2 >= 0 and doc[end - 2].is_sent_start is True:
                    doc[end - 2].is_sent_start = False
        if self.join_matcher:
            matches = self.join_matcher(doc)
            for match_id, start, end in matches:
                # If there is a sent start in the match, just remove it
                for token in doc[start:end]:
                    if token.is_sent_start:
                        token.is_sent_start = False
        if doc.is_sentenced:
            # Trim starting spaces: move each sentence's start flag from a
            # leading whitespace token to the first non-space token.
            for sent in doc.sents:
                sentlen = len(sent)
                first_non_space = 0
                while first_non_space < sentlen and sent[
                        first_non_space].is_space:
                    first_non_space += 1
                if 0 < first_non_space < sentlen:
                    sent[0].is_sent_start = False
                    sent[first_non_space].is_sent_start = True

        # Restore the parsed flag; an unsentenced doc is flagged parsed so
        # downstream components treat boundary detection as complete.
        doc.is_parsed = save_parsed if doc.is_sentenced else True
        return doc
Exemplo n.º 2
0
 def __call__(self, doc : Doc):
     """Rewrite sentence boundaries in *doc* via the split/join matchers.

     Split matches open a sentence at their final token; join matches
     clear any sentence starts inside their span. The doc is mutated in
     place and returned.
     """
     previously_parsed = doc.is_parsed
     # Clear the parsed flag so is_sent_start becomes writable.
     doc.is_parsed = False
     if self.split_matcher:
         for _, _, end in self.split_matcher(doc):
             # Mark the final matched token as a sentence opener.
             doc[end - 1].is_sent_start = True
             # Undo a boundary set immediately before the new opener.
             if end >= 2 and doc[end - 2].is_sent_start is True:
                 doc[end - 2].is_sent_start = False
     if self.join_matcher:
         for _, begin, end in self.join_matcher(doc):
             # Remove every sentence start inside the joined span.
             for tok in doc[begin:end]:
                 if tok.is_sent_start:
                     tok.is_sent_start = False
     # Restore the flag; an unsentenced doc is reported as parsed.
     doc.is_parsed = previously_parsed if doc.is_sentenced else True
     return doc
Exemplo n.º 3
0
def load_and_transform(batch_id, in_loc, out_dir):
    """Deserialize one batch of Docs from *in_loc* and write their
    transformed text to ``<out_dir>/<batch_id>.txt``.

    Returns None without doing any work when the output file already
    exists (makes reruns resumable).
    """
    out_loc = path.join(out_dir, '%d.txt' % batch_id)
    if path.exists(out_loc):
        # This batch was finished on an earlier run.
        return None
    print('Batch', batch_id)
    # Bare pipeline: every component is disabled because the docs are
    # restored from bytes rather than re-analysed.
    nlp = spacy.en.English(parser=False, tagger=False, matcher=False, entity=False)
    with io.open(out_loc, 'w', encoding='utf8') as out_file, \
            io.open(in_loc, 'rb') as in_file:
        for payload in Doc.read_bytes(in_file):
            restored = Doc(nlp.vocab).from_bytes(payload)
            restored.is_parsed = True
            out_file.write(transform_doc(restored))
Exemplo n.º 4
0
def load_and_transform(batch_id, in_loc, out_dir):
    """Read serialized Docs for batch *batch_id* from *in_loc*, run
    ``transform_doc`` on each, and write the results under *out_dir*.

    Skips the batch entirely (returns None) if its output file exists.
    """
    target = path.join(out_dir, '%d.txt' % batch_id)
    if path.exists(target):
        # Already produced — nothing to do.
        return None
    print('Batch', batch_id)
    # Pipeline components are all switched off: the docs arrive
    # pre-analysed in serialized form.
    nlp = spacy.en.English(parser=False, tagger=False, matcher=False, entity=False)
    with io.open(target, 'w', encoding='utf8') as sink:
        with io.open(in_loc, 'rb') as source:
            for raw in Doc.read_bytes(source):
                doc = Doc(nlp.vocab).from_bytes(raw)
                doc.is_parsed = True
                sink.write(transform_doc(doc))