Пример #1
0
def badsgml2text(ldc_name, args):
    """Convert a loosely SGML-formatted file into a gzipped doctext file.

    The input ``<workspace>/bsgml_trees/<ldc_name>`` is scanned line by line.
    Non-empty lines found between a line starting with ``<doc id="...">`` and
    the next ``</doc>`` line are collected and written as one document, via
    ``writedoctext``, to ``<workspace>/trees/<ldc_name>.gz``.

    :param ldc_name: file name shared by the input and the output file.
    :param args: object exposing a ``workspace`` attribute (base directory).
    :return: list with the number of lines kept per document, in file order.
    :raises Exception: on any failure; the message is the formatted traceback
        of the original error.
    """
    path = '{0}/bsgml_trees/{1}'.format(args.workspace, ldc_name)
    logging.info('Processing %s', path)
    n_lines = []
    try:
        with open(path, 'r') as fi:
            with gzip.open('{0}/trees/{1}.gz'.format(args.workspace, ldc_name), 'wb') as fo:
                doc_re = re.compile('<doc id="(.+)">')
                # doc_lines is None whenever no document is currently open
                doc_id, doc_lines = None, None
                # stream the file instead of loading it whole into memory
                for raw_line in fi:
                    line = raw_line.rstrip('\n')
                    m = doc_re.match(line)
                    if m is not None:
                        # opening tag: start collecting a new document
                        doc_id = m.group(1)
                        doc_lines = []
                    elif line == '</doc>':
                        if doc_lines is None:
                            # closing tag without a matching opening tag:
                            # there is nothing to write, so skip it instead of
                            # failing on len(None)
                            logging.warning('Ignoring </doc> without an open document in %s', path)
                            continue
                        # closing tag: flush the collected document
                        n_lines.append(len(doc_lines))
                        writedoctext(fo, doc_lines, id=doc_id)
                        doc_lines = None
                        doc_id = None
                    elif line and doc_lines is not None:
                        # non-empty content line inside an open document
                        doc_lines.append(line)
    except:
        # Bare except kept on purpose: every failure, whatever its type, is
        # re-raised as a plain Exception carrying the traceback as text.
        # NOTE(review): presumably so the traceback survives transport from a
        # worker process -- confirm against the caller.
        raise Exception(''.join(traceback.format_exception(*sys.exc_info())))

    return n_lines
Пример #2
0
def grids_from_text(ldc_desc, args):
    """Extract grids for documents in a corpus (already parsed).

    Every document read from ``<workspace>/trees/<name>.gz`` is piped to the
    external command ``args.ExtractGrid``; whatever the command prints on its
    standard output is written, under the same document id, to
    ``<workspace>/grids/<name>.gz``. Nothing is read or written when
    ``args.dry_run`` is true.

    :param ldc_desc: mapping whose ``'name'`` entry is the corpus file name.
    :param args: object exposing ``workspace``, ``dry_run`` and
        ``ExtractGrid`` (a shell-like command line).
    :return: elapsed wall-clock time in seconds.
    :raises Exception: on any failure; the message is the formatted traceback
        of the original error.
    """
    t0 = time()
    try:
        input_path = '{0}/trees/{1}'.format(args.workspace, ldc_desc['name'])
        output_path = '{0}/grids/{1}'.format(args.workspace, ldc_desc['name'])
        logging.info('processing: %s', input_path)
        if not args.dry_run:
            with gzip.open(input_path + '.gz', 'rb') as fi:
                with gzip.open(output_path + '.gz', 'wb') as fo:
                    # the command line is the same for every document
                    cmd_args = shlex.split(args.ExtractGrid)
                    for lines, attrs in iterdoctext(fi):
                        logging.debug('document %s', attrs['id'])
                        proc = subprocess.Popen(cmd_args,
                                                stdin=subprocess.PIPE,
                                                stdout=subprocess.PIPE)
                        # stderr is not piped, so the second item is always None
                        stdoutdata, _ = proc.communicate(
                            '{0}\n'.format('\n'.join(lines)))
                        if proc.returncode != 0:
                            # the output is still written (as before), but a
                            # failing extractor no longer goes unnoticed
                            logging.warning('%s exited with status %d on document %s',
                                            args.ExtractGrid, proc.returncode, attrs['id'])
                        writedoctext(fo,
                                     stdoutdata.split('\n'),
                                     id=attrs['id'])
        logging.info('done: %s', output_path)
    except:
        # Bare except kept on purpose: every failure, whatever its type, is
        # re-raised as a plain Exception carrying the traceback as text.
        raise Exception(''.join(traceback.format_exception(*sys.exc_info())))

    return time() - t0
Пример #3
0
def grids_from_text(ldc_desc, args):
    """Extract grids for documents in a corpus (already parsed).

    Reads documents from ``<workspace>/trees/<name>.gz``, feeds each one to
    the external command given by ``args.ExtractGrid`` on its standard input,
    and stores the command's standard output as a document with the same id
    in ``<workspace>/grids/<name>.gz``. With ``args.dry_run`` set, only the
    log messages are produced.

    :param ldc_desc: mapping whose ``'name'`` entry is the corpus file name.
    :param args: object exposing ``workspace``, ``dry_run`` and
        ``ExtractGrid`` (a shell-like command line).
    :return: elapsed wall-clock time in seconds.
    :raises Exception: on any failure; the message is the formatted traceback
        of the original error.
    """
    t0 = time()
    try:
        input_path = '{0}/trees/{1}'.format(args.workspace, ldc_desc['name'])
        output_path = '{0}/grids/{1}'.format(args.workspace, ldc_desc['name'])
        logging.info('processing: %s', input_path)
        if not args.dry_run:
            with gzip.open(input_path + '.gz', 'rb') as fi:
                with gzip.open(output_path + '.gz', 'wb') as fo:
                    # split the command line once; it does not depend on the document
                    cmd_args = shlex.split(args.ExtractGrid)
                    for lines, attrs in iterdoctext(fi):
                        doc_id = attrs['id']
                        logging.debug('document %s', doc_id)
                        proc = subprocess.Popen(cmd_args, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
                        # only stdout is captured; stderr goes wherever the parent's does
                        stdoutdata = proc.communicate('{0}\n'.format('\n'.join(lines)))[0]
                        if proc.returncode != 0:
                            # report the failure; the (possibly partial) output
                            # is still written, exactly as it was before
                            logging.warning('%s exited with status %d on document %s',
                                            args.ExtractGrid, proc.returncode, doc_id)
                        writedoctext(fo, stdoutdata.split('\n'), id=doc_id)
        logging.info('done: %s', output_path)
    except:
        # Bare except kept on purpose: every failure, whatever its type, is
        # re-raised as a plain Exception carrying the traceback as text.
        raise Exception(''.join(traceback.format_exception(*sys.exc_info())))

    return time() - t0
Пример #4
0
def badsgml2text(ldc_name, args):
    """Rewrite ``<workspace>/bsgml_trees/<ldc_name>`` as a gzipped doctext file.

    A document starts at a line beginning with ``<doc id="...">`` and ends at
    a line equal to ``</doc>``. The non-empty lines in between are passed to
    ``writedoctext`` together with the captured id, and the output goes to
    ``<workspace>/trees/<ldc_name>.gz``. Lines outside of any document are
    ignored.

    :param ldc_name: file name shared by the input and the output file.
    :param args: object exposing a ``workspace`` attribute (base directory).
    :return: list with the number of lines kept per document, in file order.
    :raises Exception: on any failure; the message is the formatted traceback
        of the original error.
    """
    path = '{0}/bsgml_trees/{1}'.format(args.workspace, ldc_name)
    logging.info('Processing %s', path)
    n_lines = []
    try:
        with open(path, 'r') as fi:
            with gzip.open('{0}/trees/{1}.gz'.format(args.workspace, ldc_name),
                           'wb') as fo:
                doc_re = re.compile('<doc id="(.+)">')
                # doc_lines doubles as the "a document is open" flag
                doc_id, doc_lines = None, None
                # iterate over the file directly rather than reading it whole
                for raw_line in fi:
                    line = raw_line.rstrip('\n')

                    # try to match <doc ...
                    m = doc_re.match(line)
                    if m is not None:
                        doc_id = m.group(1)
                        doc_lines = []
                        continue

                    if line != '</doc>':
                        # plain line: keep it only if non-empty and inside a doc
                        if line and doc_lines is not None:
                            doc_lines.append(line)
                        continue

                    # from here on the line is </doc>
                    if doc_lines is None:
                        # stray closing tag: previously this crashed on
                        # len(None); there is no document to emit, so skip it
                        logging.warning('Ignoring </doc> without an open document in %s', path)
                        continue
                    n_lines.append(len(doc_lines))
                    writedoctext(fo, doc_lines, id=doc_id)
                    doc_id, doc_lines = None, None
    except:
        # Bare except kept on purpose: every failure, whatever its type, is
        # re-raised as a plain Exception carrying the traceback as text.
        # NOTE(review): presumably so the traceback survives transport from a
        # worker process -- confirm against the caller.
        raise Exception(''.join(traceback.format_exception(*sys.exc_info())))

    return n_lines
Пример #5
0
def main(args):
    """Extract documents and shuffle sentences within each document.

    Documents are read with ``iterdoctext`` from the file named by
    ``args.directory``; the lines of each document are shuffled in place with
    ``random.shuffle`` and written, with the document's original attributes,
    to ``<args.directory>.shuffled``.

    :param args: object whose ``directory`` attribute is the input file path.
    :raises Exception: on any failure; the message is the formatted traceback
        of the original error.
    """
    try:
        # both files are context-managed so the input handle is closed as well
        with open(args.directory, 'r') as fi:
            with open('{0}.shuffled'.format(args.directory), 'w') as fo:
                for lines, attributes in iterdoctext(fi):
                    random.shuffle(lines)
                    logging.debug('shuffled: %s', lines)
                    writedoctext(fo, lines, **attributes)
                logging.info('done: %s', args.directory)
    except:
        # every failure is re-raised as a plain Exception carrying the
        # traceback as text
        raise Exception(''.join(traceback.format_exception(*sys.exc_info())))
Пример #6
0
def main(args):
    """Extract documents and shuffle sentences within each document.

    The input is the file at ``args.directory``; the output is written next
    to it as ``<args.directory>.shuffled``. Each document yielded by
    ``iterdoctext`` has its lines shuffled in place before being passed, with
    its attributes, to ``writedoctext``.

    :param args: object whose ``directory`` attribute is the input file path.
    :raises Exception: on any failure; the message is the formatted traceback
        of the original error.
    """
    shuffled_path = '{0}.shuffled'.format(args.directory)
    try:
        # the input is opened first, as before, but is now closed on exit too
        with open(args.directory, 'r') as fi:
            with open(shuffled_path, 'w') as fo:
                for lines, attributes in iterdoctext(fi):
                    random.shuffle(lines)
                    logging.debug('shuffled: %s', lines)
                    writedoctext(fo, lines, **attributes)
                logging.info('done: %s', args.directory)
    except:
        # every failure is re-raised as a plain Exception carrying the
        # traceback as text
        raise Exception(''.join(traceback.format_exception(*sys.exc_info())))
Пример #7
0
def main(args):
    """Turn every input document of trees into lines of d-sequences.

    Documents come from ``iterdoctext(args.input)``. For each tree, ``dseqs``
    is called with the options found on ``args`` and its patterns are joined
    with single spaces; the resulting lines are handed lazily to
    ``writedoctext`` on ``args.output`` along with the document's attributes.
    """
    def as_line(tree):
        # one output line per tree: its patterns separated by blanks
        patterns = dseqs(tree,
                         depth=args.depth,
                         no_punc=not args.punc,
                         lexicalised=args.lexicalised,
                         child_phrase=args.child,
                         backoff=['*'])
        return ' '.join(patterns)

    for doc_trees, doc_attrs in iterdoctext(args.input):
        writedoctext(args.output,
                     (as_line(tree) for tree in doc_trees),
                     **doc_attrs)
Пример #8
0
def main(args):
    """Write the d-sequences of each input document to the output.

    Each document read by ``iterdoctext`` from ``args.input`` is a collection
    of trees. Every tree is converted by ``dseqs`` (configured from ``args``)
    into patterns, which are joined by spaces into one line. Lines are
    generated on demand while ``writedoctext`` writes them to ``args.output``
    with the document's attributes.
    """
    def lines_of(doc_trees):
        # lazily yields one space-joined line of patterns per tree
        for tree in doc_trees:
            yield ' '.join(dseqs(tree,
                                 depth=args.depth,
                                 no_punc=not args.punc,
                                 lexicalised=args.lexicalised,
                                 child_phrase=args.child,
                                 backoff=['*']))

    for trees, attrs in iterdoctext(args.input):
        writedoctext(args.output, lines_of(trees), **attrs)
Пример #9
0
def extract_and_save_txt(sgml_gz, args):
    """Extract the documents of a gzipped SGML file and save their text.

    The whole file is read and parsed with ``TextFromSGML``. Every document
    whose ``'text'`` is non-empty is written, split into lines, through
    ``writedoctext`` to ``<workspace>/raw/<stem>.gz``, where ``stem`` is
    ``get_ldc_name(sgml_gz)``.

    :param sgml_gz: path to the gzipped SGML input file.
    :param args: object exposing a ``workspace`` attribute (base directory).
    :return: ids of the documents that were written, in parser order.
    :raises Exception: on any failure; the message is the formatted traceback
        of the original error.
    """
    try:
        ids = []
        logging.info('Processing %s', sgml_gz)
        stem = get_ldc_name(sgml_gz)
        with gzip.open(sgml_gz, 'rb') as fi:
            with gzip.open('{0}/raw/{1}.gz'.format(args.workspace, stem), 'wb') as fo:
                parser = TextFromSGML(fi.read(), text_under='text', root='sgml')
                for doc in parser.iterdocs():
                    # documents without text are skipped and not reported
                    if doc['text']:
                        ids.append(doc['id'])
                        writedoctext(fo, doc['text'].split('\n'), id=doc['id'])
                logging.info('%s contains %d documents', stem, len(ids))
        return ids
    except:
        # every failure is re-raised as a plain Exception carrying the
        # traceback as text
        raise Exception(''.join(traceback.format_exception(*sys.exc_info())))
Пример #10
0
def main(args):
    """Parse all input documents in parallel and write the resulting trees.

    All documents are first read from ``args.input``; ``wrap_parse`` is then
    mapped over them on a pool of ``args.jobs`` worker processes. Each result
    is a ``(trees, dt)`` pair: the trees are written to ``args.output`` with
    the attributes of the document at the same position, and the ``dt``
    values are printed to stderr as a table.

    :param args: object exposing ``input``, ``output`` and ``jobs``; it is
        also forwarded to ``wrap_parse``.
    """
    logging.basicConfig(level=logging.INFO, format='%(levelname)s %(message)s')

    # reads docs from input
    docs = list(iterdoctext(args.input))

    # distributes the jobs
    pool = Pool(args.jobs)
    logging.info('Distributing %d jobs to %d workers', len(docs), args.jobs)
    try:
        result = pool.map(partial(wrap_parse, args=args), docs)
    except:
        # the map failed: stop the workers right away, then let the error out
        pool.terminate()
        raise
    else:
        # all results are in: let the workers exit normally
        pool.close()
    finally:
        # in both cases wait for the worker processes to be gone
        pool.join()

    # stores the output
    times = []
    for (_content, attrs), (trees, dt) in itertools.izip(docs, result):
        writedoctext(args.output, trees, **attrs)
        times.append(dt)

    # dumps a summary
    print >> sys.stderr, tabulate(enumerate(times), headers=['doc', 'time'], tablefmt='pipe')
Пример #11
0
def main(args):
    """Parse the input documents with a process pool and store the trees.

    The documents are materialised from ``iterdoctext(args.input)`` and
    handed to ``wrap_parse`` through ``Pool.map`` using ``args.jobs``
    workers. Results are ``(trees, dt)`` pairs matched positionally with
    their source documents; trees go to ``args.output`` with the document's
    attributes and the ``dt`` values are tabulated on stderr.

    :param args: object exposing ``input``, ``output`` and ``jobs``; it is
        also forwarded to ``wrap_parse``.
    """
    logging.basicConfig(level=logging.INFO, format='%(levelname)s %(message)s')

    # reads docs from input
    docs = list(iterdoctext(args.input))

    # distributes the jobs
    pool = Pool(args.jobs)
    logging.info('Distributing %d jobs to %d workers', len(docs), args.jobs)
    try:
        result = pool.map(partial(wrap_parse, args=args), docs)
    except:
        # a job failed: do not leave the remaining workers running
        pool.terminate()
        raise
    else:
        # nothing more will be submitted; workers may shut down
        pool.close()
    finally:
        # reap the worker processes before writing any output
        pool.join()

    # stores the output (the document content itself is not needed here)
    times = []
    for (_content, attrs), (trees, dt) in itertools.izip(docs, result):
        writedoctext(args.output, trees, **attrs)
        times.append(dt)

    # dumps a summary
    print >> sys.stderr, tabulate(enumerate(times), headers=['doc', 'time'], tablefmt='pipe')
Пример #12
0
def extract_and_save_txt(sgml_gz, args):
    """Extract documents from a gzipped SGML file and return their ids.

    The file content is parsed by ``TextFromSGML``; each document with a
    non-empty ``'text'`` is written line by line, via ``writedoctext``, to
    ``<workspace>/raw/<stem>.gz`` with ``stem = get_ldc_name(sgml_gz)``.

    :param sgml_gz: path to the gzipped SGML input file.
    :param args: object exposing a ``workspace`` attribute (base directory).
    :return: ids of the documents that were written, in parser order.
    :raises Exception: on any failure; the message is the formatted traceback
        of the original error.
    """
    try:
        ids = []
        logging.info('Processing %s', sgml_gz)
        stem = get_ldc_name(sgml_gz)
        with gzip.open(sgml_gz, 'rb') as fi:
            with gzip.open('{0}/raw/{1}.gz'.format(args.workspace, stem),
                           'wb') as fo:
                parser = TextFromSGML(fi.read(),
                                      text_under='text',
                                      root='sgml')
                for doc in parser.iterdocs():
                    # only documents that actually carry text are kept
                    if doc['text']:
                        ids.append(doc['id'])
                        writedoctext(fo, doc['text'].split('\n'), id=doc['id'])
                logging.info('%s contains %d documents', stem, len(ids))
        return ids
    except:
        # every failure is re-raised as a plain Exception carrying the
        # traceback as text
        raise Exception(''.join(traceback.format_exception(*sys.exc_info())))
Пример #13
0
    return todo, done, missing


def wrap_dseqs(job, depth, **kwargs):
    """
    Wrap a call to dseqs. To be used with Pool.map.

    For every document read from the input file, each tree is converted by
    ``dseqs`` into patterns that are joined with single spaces; the resulting
    lines are written to the output file with the document's attributes.

    :param job: ``(i, ipath, opath)`` triple -- a job number (used for
        logging only), the input path and the output path. It must be passed
        positionally, as ``Pool.map`` does.
    :param depth: forwarded to ``dseqs`` as ``depth``.
    :param kwargs: further keyword arguments forwarded to ``dseqs``.
    :raises Exception: on any failure; the message is the formatted traceback
        of the original error.
    """
    # unpacked here rather than in the signature: tuple parameters were
    # removed from the language (PEP 3113), and this form accepts the same calls
    i, ipath, opath = job
    try:
        logging.info('(%d) %s ', i, ipath)
        # NOTE(review): assumes the objects returned by smart_open provide
        # close(), like regular file objects -- confirm
        fi = smart_open(ipath, 'r')
        try:
            fo = smart_open(opath, 'w')
            try:
                for trees, attrs in iterdoctext(fi):
                    sequences = [
                        ' '.join(dseqs(tree, depth=depth, **kwargs)) for tree in trees
                    ]
                    writedoctext(fo, sequences, **attrs)
            finally:
                # closing explicitly also flushes the output before returning
                fo.close()
        finally:
            fi.close()
    except:
        # every failure is re-raised as a plain Exception carrying the
        # traceback as text
        raise Exception(''.join(traceback.format_exception(*sys.exc_info())))


def extract_dseqs(corpus, args, namespace, **kwargs):
    """
    Extracts dsequences for a certain corpus
    """

    logging.info('Extracting d-sequences for: %s', corpus)
    input_dir = namespace.trees
    output_dir = namespace.dseqs

    todo, done, missing = file_check(corpus, input_dir, output_dir)
    if not missing:
Пример #14
0
    done = frozenset(os.path.basename(path) for path in glob('{0}/{1}*'.format(output_dir, corpus)))
    logging.info('%d files matching %s', len(done), '{0}/{1}*'.format(output_dir, corpus))
    missing = todo - done
    return todo, done, missing

def wrap_dseqs(job, depth, **kwargs):
    """
    Wrap a call to dseqs. To be used with Pool.map.

    Reads documents of trees from the input file, turns every tree into one
    line (the patterns returned by ``dseqs`` joined with single spaces) and
    writes the lines of each document, with its attributes, to the output
    file.

    :param job: ``(i, ipath, opath)`` triple -- a job number (used for
        logging only), the input path and the output path. It must be passed
        positionally, as ``Pool.map`` does.
    :param depth: forwarded to ``dseqs`` as ``depth``.
    :param kwargs: further keyword arguments forwarded to ``dseqs``.
    :raises Exception: on any failure; the message is the formatted traceback
        of the original error.
    """
    # tuple parameters no longer exist in the language (PEP 3113); unpacking
    # in the body accepts exactly the same positional calls
    i, ipath, opath = job
    try:
        logging.info('(%d) %s ', i, ipath)
        # NOTE(review): assumes the objects returned by smart_open provide
        # close(), like regular file objects -- confirm
        fi = smart_open(ipath, 'r')
        try:
            fo = smart_open(opath, 'w')
            try:
                for trees, attrs in iterdoctext(fi):
                    sequences = [' '.join(dseqs(tree, depth=depth, **kwargs)) for tree in trees]
                    writedoctext(fo, sequences, **attrs)
            finally:
                # make sure buffered output reaches the file before returning
                fo.close()
        finally:
            fi.close()
    except:
        # every failure is re-raised as a plain Exception carrying the
        # traceback as text
        raise Exception(''.join(traceback.format_exception(*sys.exc_info())))

def extract_dseqs(corpus, args, namespace, **kwargs):
    """
    Extracts dsequences for a certain corpus
    """

    logging.info('Extracting d-sequences for: %s', corpus)
    input_dir = namespace.trees
    output_dir = namespace.dseqs

    todo, done, missing = file_check(corpus, input_dir, output_dir)
    if not missing:
        logging.info('all d-sequences of depth %d are there, nothing to be done', args.depth)