예제 #1
0
파일: cli.py 프로젝트: chbrown/xdoc-python
def translate(parser):
    opts = parser.parse_args()
    logger = logging.getLogger(__name__)

    # read input
    input_root, input_extension = os.path.splitext(opts.input)
    if input_extension == '.docx':
        logger.info('Reading from filepath: %s', opts.input)

        from xdoc.formats.docx import read
        with open(opts.input) as fp:
            document = read(fp)
    else:
        raise NotImplementedError('File extension "%s" not supported as input' % input_extension)

    # write output
    output_root, output_extension = os.path.splitext(opts.output)
    if output_extension == '.tex':
        tex_filepath = output_root + '.tex'
        bib_filepath = output_root + '.bib'
        logger.info('Writing to filepaths: %s & %s', tex_filepath, bib_filepath)

        from xdoc.formats.tex import write
        with open(tex_filepath, 'w') as tex_fp:
            with open(bib_filepath, 'w') as bib_fp:
                write(tex_fp, bib_fp, document)
    else:
        raise NotImplementedError('File extension "%s" not supported as output' % output_extension)
예제 #2
0
파일: cli.py 프로젝트: chbrown/xdoc-python
def main():
    parser = argparse.ArgumentParser(
        description='Usage: xdoc original.docx converted.tex',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # parser.add_argument('input', nargs='?', type=argparse.FileType('r'), default=sys.stdin)
    parser.add_argument('input', help='input filename')
    # parser.add_argument('output', nargs='?', type=argparse.FileType('w'), default=sys.stdout)
    parser.add_argument('output', help='output filename')
    parser.add_argument('-a', '--action', choices=actions, default='translate', help='xdoc action')
    parser.add_argument('-v', '--verbose', action='store_true', help='Log extra output')
    opts = parser.parse_args()

    logging.root.setLevel(logging.DEBUG if opts.verbose else logging.INFO)
    logger = logging.getLogger(__name__)
    logger.info('Logging with level >= %s (%s)', logging.root.level, logging.getLevelName(logging.root.level))

    actions[opts.action](parser)

    logger.debug('Done')
예제 #3
0
파일: cli.py 프로젝트: chbrown/xdoc-python
def parsebib(parser):
    opts = parser.parse_args()
    logger = logging.getLogger(__name__)

    from xdoc.bibliography import crossref_lookup
    from xdoc.formats.tex import serialize_reference

    input = sys.stdin if (opts.input == '-') else open(opts.input)
    output = sys.stdout if (opts.output == '-') else open(opts.output)

    for line in input:
        line = line.strip().decode('utf8')
        logger.info('Resolving "%s" via CrossRef API', line)

        for bibitem in crossref_lookup(line):
            print >> output, serialize_reference(bibitem)
            break
        else:
            logger.error('FIXME: could not parse bib item: %s', line)
예제 #4
0
파일: crf.py 프로젝트: chbrown/xdoc-python
# a lot of this is from Tweedr
import crfsuite
import re
from colorama import Fore
import tempfile
from xdoc.formats.tex import serialize_reference
from xdoc.lib.text import utf8str
from unidecode import unidecode
from viz import gloss
# from sklearn import linear_model, naive_bayes, neighbors, svm
# from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

from xdoc.lib.log import logging
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

xml_begin = re.compile('<(\w+)>')
xml_end = re.compile('</(\w+)>')
# tex_command = re.compile(r'\\[a-z]+\{([^\}]+)\}')


def mean(xs):
    return sum(xs) / float(len(xs))


class ItemSequence(crfsuite.ItemSequence):
    def __init__(self, features_iter, check=False):
        '''Create new ItemSequence, typedef std::vector<Item> based on the
        given iterable of iterable of 2-tuples or strings.
        If check=True, any unicode present in the given features_iter
        will be encoded into a bytestring as utf8.'''