예제 #1
0
def grids_from_text(ldc_desc, args):
    """Extract grids for documents in a corpus (already parsed).

    Reads ``<workspace>/trees/<name>.gz``, pipes every document through the
    external command ``args.ExtractGrid`` and writes what the command prints
    to ``<workspace>/grids/<name>.gz`` under the same document id.

    Arguments
    ---------
    ldc_desc: mapping whose ``'name'`` entry names the corpus file
    args: namespace providing ``workspace``, ``dry_run`` and ``ExtractGrid``

    Returns
    -------
    difference between the ``time()`` readings at exit and at entry

    Raises
    ------
    Exception: any failure is re-raised as a plain ``Exception`` whose
        message is the formatted traceback
    """
    t0 = time()
    try:
        input_path = '{0}/trees/{1}'.format(args.workspace, ldc_desc['name'])
        output_path = '{0}/grids/{1}'.format(args.workspace, ldc_desc['name'])
        logging.info('processing: %s', input_path)
        if not args.dry_run:
            # the command is the same for every document: parse it only once
            cmd_args = shlex.split(args.ExtractGrid)
            with gzip.open(input_path + '.gz', 'rb') as fi:
                with gzip.open(output_path + '.gz', 'wb') as fo:
                    for lines, attrs in iterdoctext(fi):
                        logging.debug('document %s', attrs['id'])
                        proc = subprocess.Popen(cmd_args,
                                                stdin=subprocess.PIPE,
                                                stdout=subprocess.PIPE)
                        # stderr is not piped, so only stdout is captured
                        stdoutdata, _ = proc.communicate(
                            '{0}\n'.format('\n'.join(lines)))
                        if proc.returncode != 0:
                            # keep going (best effort) but leave a trace of
                            # the failure instead of ignoring it silently
                            logging.warning(
                                'command "%s" exited with status %d '
                                'for document %s',
                                args.ExtractGrid, proc.returncode,
                                attrs['id'])
                        writedoctext(fo,
                                     stdoutdata.split('\n'),
                                     id=attrs['id'])
        logging.info('done: %s', output_path)
    except:
        # embed the formatted traceback in the message of the new exception
        raise Exception(''.join(traceback.format_exception(*sys.exc_info())))
    return time() - t0
예제 #2
0
def grids_from_text(ldc_desc, args):
    """Extract grids for documents in a corpus (already parsed).

    Each document found in ``<workspace>/trees/<name>.gz`` is sent to the
    standard input of the external command ``args.ExtractGrid``; the
    command's standard output is stored, under the same document id, in
    ``<workspace>/grids/<name>.gz``.

    Arguments
    ---------
    ldc_desc: mapping whose ``'name'`` entry names the corpus file
    args: namespace providing ``workspace``, ``dry_run`` and ``ExtractGrid``

    Returns
    -------
    difference between the ``time()`` readings at exit and at entry

    Raises
    ------
    Exception: any failure is re-raised as a plain ``Exception`` whose
        message is the formatted traceback
    """
    t0 = time()
    try:
        input_path = '{0}/trees/{1}'.format(args.workspace, ldc_desc['name'])
        output_path = '{0}/grids/{1}'.format(args.workspace, ldc_desc['name'])
        logging.info('processing: %s', input_path)
        if not args.dry_run:
            # loop-invariant: split the command line once, not per document
            cmd_args = shlex.split(args.ExtractGrid)
            with gzip.open(input_path + '.gz', 'rb') as fi:
                with gzip.open(output_path + '.gz', 'wb') as fo:
                    for lines, attrs in iterdoctext(fi):
                        doc_id = attrs['id']
                        logging.debug('document %s', doc_id)
                        proc = subprocess.Popen(cmd_args,
                                                stdin=subprocess.PIPE,
                                                stdout=subprocess.PIPE)
                        document = '{0}\n'.format('\n'.join(lines))
                        # only stdout is piped; the second item is unused
                        stdoutdata = proc.communicate(document)[0]
                        if proc.returncode != 0:
                            # report the failure but keep processing the
                            # remaining documents, as before
                            logging.warning(
                                'command "%s" exited with status %d '
                                'for document %s',
                                args.ExtractGrid, proc.returncode, doc_id)
                        writedoctext(fo, stdoutdata.split('\n'), id=doc_id)
        logging.info('done: %s', output_path)
    except:
        # embed the formatted traceback in the message of the new exception
        raise Exception(''.join(traceback.format_exception(*sys.exc_info())))
    return time() - t0
예제 #3
0
def main(args):
    """Extract entities and construct a grid for every input document.

    Documents are read from ``args.directory``; the output goes to
    ``args.directory + '_grid'``. Each grid is preceded by two header lines,
    ``# docid=<document id>`` and ``# id=<running index>``.

    Arguments
    ---------
    args: namespace providing ``directory`` (path of the input file)

    Raises
    ------
    Exception: any failure is re-raised as a plain ``Exception`` whose
        message is the formatted traceback
    """
    try:
        with open(args.directory, 'rb') as fi, \
                open(args.directory + '_grid', 'w') as fo:
            for text_idx, (lines, attrs) in enumerate(iterdoctext(fi)):
                logging.debug('document %s', attrs['id'])
                print(' extract ' + str(len(lines)) + ' lines')

                fo.write("# docid=" + attrs['id'] + '\n')
                # text_idx is an int: convert it before concatenating,
                # otherwise the first document raises a TypeError
                fo.write("# id=" + str(text_idx) + '\n')

                entities, sent_num = extract_grids(lines)
                print(entities)

                grid = construct_grid(entities, sent_num)
                print(grid)

                output_grid(grid, fo)
            logging.info('done: %s', args.directory)
    except:
        # embed the formatted traceback in the message of the new exception
        raise Exception(''.join(traceback.format_exception(*sys.exc_info())))
예제 #4
0
def extract_grids(fi):
    """Identify entities from PTB trees and store them for grid construction.

    Every line of every document read from ``fi`` is passed to
    ``convert_tree`` together with the entities gathered so far.

    Arguments
    ---------
    fi: input handed to ``iterdoctext``

    Returns
    -------
    (entities, idx): the entity dictionary and the last index produced by
        ``convert_tree`` (0 when there is nothing to convert)
    """
    idx = 0
    entities = defaultdict(lambda: defaultdict(dict))
    for lines, attrs in iterdoctext(fi):
        logging.debug('document %s', attrs['docid'])
        print(' extract ' + str(len(lines)) + ' lines')
        for line in lines:
            # NOTE(review): assumes convert_tree returns the updated
            # (entities, idx) pair, as the original unpacking suggests.
            # Unpacking the generator itself only worked for documents with
            # exactly two lines.
            entities, idx = convert_tree(line, entities)

    return entities, idx
예제 #5
0
def extract_grids(fi):
    """Identify entities from PTB trees and store them for grid construction.

    The documents are read from ``fi`` with ``iterdoctext``; each of their
    lines is handed to ``convert_tree`` along with the entities collected up
    to that point.

    Arguments
    ---------
    fi: input handed to ``iterdoctext``

    Returns
    -------
    (entities, idx): the entity dictionary and the last index produced by
        ``convert_tree`` (0 when there is nothing to convert)
    """
    idx = 0
    entities = defaultdict(lambda: defaultdict(dict))
    for lines, attrs in iterdoctext(fi):
        logging.debug('document %s', attrs['docid'])
        print(' extract ' + str(len(lines)) + ' lines')
        # Convert line by line: unpacking a generator of results into two
        # names fails unless the document has exactly two lines.
        # NOTE(review): assumes convert_tree returns (entities, idx) -- confirm.
        for line in lines:
            entities, idx = convert_tree(line, entities)

    return entities, idx
예제 #6
0
def main(args):
    """Extract documents and shuffle sentences within each document.

    Documents are read from ``args.directory`` and written, with their lines
    shuffled in place, to ``<args.directory>.shuffled``.

    Arguments
    ---------
    args: namespace providing ``directory`` (path of the input file)

    Raises
    ------
    Exception: any failure is re-raised as a plain ``Exception`` whose
        message is the formatted traceback
    """
    try:
        # both files are managed by the with statement, so the input is
        # closed as well (it used to be left open)
        with open(args.directory, 'r') as fi, \
                open('{0}.shuffled'.format(args.directory), 'w') as fo:
            for lines, attributes in iterdoctext(fi):
                random.shuffle(lines)
                logging.debug('shuffled: %s', lines)
                writedoctext(fo, lines, **attributes)
            logging.info('done: %s', args.directory)
    except:
        # embed the formatted traceback in the message of the new exception
        raise Exception(''.join(traceback.format_exception(*sys.exc_info())))
예제 #7
0
def main(args):
    """Extract documents and shuffle sentences within each document.

    Reads the documents of ``args.directory``, shuffles the lines of each one
    in place and writes the result to ``<args.directory>.shuffled``.

    Arguments
    ---------
    args: namespace providing ``directory`` (path of the input file)

    Raises
    ------
    Exception: any failure is re-raised as a plain ``Exception`` whose
        message is the formatted traceback
    """
    try:
        # open the input in a with block so that it is closed on exit
        with open(args.directory, 'r') as fi:
            with open('{0}.shuffled'.format(args.directory), 'w') as fo:
                for lines, attributes in iterdoctext(fi):
                    random.shuffle(lines)
                    logging.debug('shuffled: %s', lines)
                    writedoctext(fo, lines, **attributes)
                logging.info('done: %s', args.directory)
    except:
        # embed the formatted traceback in the message of the new exception
        raise Exception(''.join(traceback.format_exception(*sys.exc_info())))
예제 #8
0
def main(args):
    """Convert every document of ``args.input`` into d-sequences.

    Each tree of a document becomes one output line: the patterns returned
    by ``dseqs`` joined by single spaces. Documents are written to
    ``args.output`` with their original attributes.
    """

    def pattern_lines(trees):
        # lazily yields one space-separated line of patterns per tree
        for tree in trees:
            patterns = dseqs(tree,
                             depth=args.depth,
                             no_punc=not args.punc,
                             lexicalised=args.lexicalised,
                             child_phrase=args.child,
                             backoff=['*'])
            yield ' '.join(patterns)

    for trees, attrs in iterdoctext(args.input):
        writedoctext(args.output, pattern_lines(trees), **attrs)
예제 #9
0
def main(args):
    """Read documents from ``args.input`` and write their d-sequences.

    For every tree, the patterns computed by ``dseqs`` are joined with
    spaces into one line; the lines of a document are written to
    ``args.output`` together with the document's attributes.
    """

    def to_line(tree):
        # one tree -> its patterns joined into a single line
        return ' '.join(dseqs(tree,
                              depth=args.depth,
                              no_punc=not args.punc,
                              lexicalised=args.lexicalised,
                              child_phrase=args.child,
                              backoff=['*']))

    for trees, attrs in iterdoctext(args.input):
        # generator: the lines are produced while the document is written
        output_lines = (to_line(tree) for tree in trees)
        writedoctext(args.output, output_lines, **attrs)
예제 #10
0
def main(args):
    """Extract documents and output each document to a separate file.

    The n-th document (counting from 0) read from ``args.directory`` is
    written, one line per row, to ``<args.directory>.<n>``.

    Arguments
    ---------
    args: namespace providing ``directory`` (path of the input file)

    Raises
    ------
    Exception: any failure is re-raised as a plain ``Exception`` whose
        message is the formatted traceback
    """
    try:
        # the with statement closes the input file (it used to be left open)
        with open(args.directory, 'r') as fi:
            for idx, (lines, attributes) in enumerate(iterdoctext(fi)):
                with open('{0}.{1}'.format(args.directory, idx), 'w') as fo:
                    logging.debug('done: %s', args.directory)
                    logging.debug('doc: %s', lines)
                    for line in lines:
                        fo.write('{0}\n'.format(line))
                logging.info('done: %s', args.directory)
    except:
        # embed the formatted traceback in the message of the new exception
        raise Exception(''.join(traceback.format_exception(*sys.exc_info())))
예제 #11
0
def main(args):
    """Extract documents and output each document to a separate file.

    Documents are read from ``args.directory``; document number ``n``
    (starting at 0) ends up in ``<args.directory>.<n>``, one line per row.

    Arguments
    ---------
    args: namespace providing ``directory`` (path of the input file)

    Raises
    ------
    Exception: any failure is re-raised as a plain ``Exception`` whose
        message is the formatted traceback
    """
    try:
        # manage the input with a with block so it is closed when done
        with open(args.directory, 'r') as fi:
            documents = enumerate(iterdoctext(fi))
            for idx, (lines, attributes) in documents:
                opath = '{0}.{1}'.format(args.directory, idx)
                with open(opath, 'w') as fo:
                    logging.debug('done: %s', args.directory)
                    logging.debug('doc: %s', lines)
                    for line in lines:
                        fo.write('{0}\n'.format(line))
                logging.info('done: %s', args.directory)
    except:
        # embed the formatted traceback in the message of the new exception
        raise Exception(''.join(traceback.format_exception(*sys.exc_info())))
예제 #12
0
def main(args):
    """
    Converts doctext to good SGML

    Every document read from ``args.input`` is added to a ``MakeSGMLDocs``
    instance, which is then written to ``args.output``.

    Arguments
    ---------
    argparse's args
    """
    from discourse.doctext import iterdoctext
    from discourse.docsgml import MakeSGMLDocs

    sgmler = MakeSGMLDocs()
    # plain loop: add_doc is called for its effect on sgmler, so there is no
    # point in collecting its return values in a throwaway list
    for content, attrs in iterdoctext(args.input):
        sgmler.add_doc(content, **attrs)
    sgmler.write(args.output)
예제 #13
0
def main(args):
    """Parse the documents of ``args.input`` in parallel.

    The documents are distributed over ``args.jobs`` worker processes, each
    one handled by ``wrap_parse``. The resulting trees are written to
    ``args.output`` in input order, and a table with the time reported for
    each document is printed to stderr.

    Arguments
    ---------
    args: namespace providing ``input``, ``output`` and ``jobs``
    """
    logging.basicConfig(level=logging.INFO, format='%(levelname)s %(message)s')

    # reads docs from input
    docs = list(iterdoctext(args.input))

    # distributes the jobs
    pool = Pool(args.jobs)
    try:
        logging.info('Distributing %d jobs to %d workers', len(docs), args.jobs)
        result = pool.map(partial(wrap_parse, args=args), docs)
    finally:
        # release the worker processes whether or not the mapping succeeded
        pool.close()
        pool.join()

    # stores the output
    times = []
    for (_, attrs), (trees, dt) in zip(docs, result):
        writedoctext(args.output, trees, **attrs)
        times.append(dt)

    # dumps a summary
    summary = tabulate(enumerate(times), headers=['doc', 'time'], tablefmt='pipe')
    sys.stderr.write(summary + '\n')
예제 #14
0
def main(args):
    """Parse the documents of ``args.input`` using a pool of workers.

    ``wrap_parse`` is mapped over the documents by ``args.jobs`` processes.
    The trees it returns are written to ``args.output`` in input order, and
    the per-document times it reports are printed to stderr as a table.

    Arguments
    ---------
    args: namespace providing ``input``, ``output`` and ``jobs``
    """
    logging.basicConfig(level=logging.INFO, format='%(levelname)s %(message)s')

    # reads docs from input
    docs = list(iterdoctext(args.input))

    # distributes the jobs
    pool = Pool(args.jobs)
    logging.info('Distributing %d jobs to %d workers', len(docs), args.jobs)
    try:
        result = pool.map(partial(wrap_parse, args=args), docs)
    finally:
        # the pool is not needed past this point: shut the workers down
        pool.close()
        pool.join()

    # stores the output
    times = []
    for (_, attrs), (trees, dt) in zip(docs, result):
        writedoctext(args.output, trees, **attrs)
        times.append(dt)

    # dumps a summary
    table = tabulate(enumerate(times), headers=['doc', 'time'], tablefmt='pipe')
    sys.stderr.write(table + '\n')
예제 #15
0
        for path in glob('{0}/{1}*'.format(output_dir, corpus)))
    logging.info('%d files matching %s', len(done),
                 '{0}/{1}*'.format(output_dir, corpus))
    missing = todo - done
    return todo, done, missing


def wrap_dseqs((i, ipath, opath), depth, **kwargs):
    """
    Wrap a call to dseqs. To be used with Pool.map.
    """
    try:
        logging.info('(%d) %s ', i, ipath)
        fi = smart_open(ipath, 'r')
        fo = smart_open(opath, 'w')
        for trees, attrs in iterdoctext(fi):
            sequences = [
                ' '.join(dseqs(tree, depth=depth, **kwargs)) for tree in trees
            ]
            writedoctext(fo, sequences, **attrs)
    except:
        raise Exception(''.join(traceback.format_exception(*sys.exc_info())))


def extract_dseqs(corpus, args, namespace, **kwargs):
    """
    Extracts dsequences for a certain corpus
    """

    logging.info('Extracting d-sequences for: %s', corpus)
    input_dir = namespace.trees
예제 #16
0
def read_grids(istream, str2int):
    """Read the documents of ``istream`` as integer arrays.

    Every item of every line is looked up in ``str2int``; each document
    yields one ``np.array`` built with ``int`` as its dtype.

    Returns the list of arrays, one per document.
    """
    grids = []
    for lines, _ in iterdoctext(istream):
        rows = [[str2int[role] for role in line] for line in lines]
        grids.append(np.array(rows, int))
    return grids
예제 #17
0
def read_alignments(istream):
    """Read the documents of ``istream`` as integer arrays of alignments.

    Every line is expanded into the list of its items; each document yields
    one ``np.array`` built with ``int`` as its dtype.

    Returns the list of arrays, one per document.
    """
    alignments = []
    for lines, _ in iterdoctext(istream):
        rows = [list(line) for line in lines]
        alignments.append(np.array(rows, int))
    return alignments
예제 #18
0
def read_grids(istream, str2int):
    """Return one integer array per document read from ``istream``.

    The items of each line are mapped through ``str2int`` and the resulting
    rows are packed into an ``np.array`` built with ``int`` as its dtype.
    """

    def encode(lines):
        # one document -> rows of str2int values -> integer array
        rows = []
        for line in lines:
            rows.append([str2int[role] for role in line])
        return np.array(rows, int)

    return [encode(lines) for lines, attrs in iterdoctext(istream)]
예제 #19
0
    todo = frozenset(os.path.basename(path) for path in glob('{0}/{1}*'.format(input_dir, corpus)))
    logging.info('%d files matching %s', len(todo), '{0}/{1}*'.format(input_dir, corpus))
    done = frozenset(os.path.basename(path) for path in glob('{0}/{1}*'.format(output_dir, corpus)))
    logging.info('%d files matching %s', len(done), '{0}/{1}*'.format(output_dir, corpus))
    missing = todo - done
    return todo, done, missing

def wrap_dseqs((i, ipath, opath), depth, **kwargs):
    """
    Wrap a call to dseqs. To be used with Pool.map.
    """
    try:
        logging.info('(%d) %s ', i, ipath)
        fi = smart_open(ipath, 'r')
        fo = smart_open(opath, 'w')
        for trees, attrs in iterdoctext(fi):
            sequences = [' '.join(dseqs(tree, depth=depth, **kwargs)) for tree in trees]
            writedoctext(fo, sequences, **attrs)
    except:
        raise Exception(''.join(traceback.format_exception(*sys.exc_info())))

def extract_dseqs(corpus, args, namespace, **kwargs):
    """
    Extracts dsequences for a certain corpus
    """

    logging.info('Extracting d-sequences for: %s', corpus)
    input_dir = namespace.trees
    output_dir = namespace.dseqs

    todo, done, missing = file_check(corpus, input_dir, output_dir)
def read_alignments(istream):
    """Return one integer array of alignments per document of ``istream``.

    Each line is expanded into the list of its items and the rows of a
    document are packed into an ``np.array`` built with ``int`` as its dtype.
    """

    def as_array(lines):
        # one document -> list of rows -> integer array
        return np.array([list(line) for line in lines], int)

    return [as_array(lines) for lines, attrs in iterdoctext(istream)]