예제 #1
0
def handle_args():
    """Build and run the CLI parser for the PDF-to-database tool.

    Returns:
        argparse.Namespace with ``input`` (path of the PDF to parse)
        and ``dropall`` (True when all tables should be dropped first).
    """
    arg_parser = argparse.ArgumentParser(
        description="Parse a QLD Members' Interests PDF to a database.")
    arg_parser.add_argument('input', help='the PDF file to parse')
    arg_parser.add_argument(
        '--dropall',
        action='store_true',
        help='drop all tables before processing begins')
    return arg_parser.parse_args()
예제 #2
0
def get_args():
    """Parse command-line options for the roll-call-vote parser.

    Returns:
        argparse.Namespace whose ``file`` attribute is the input PDF
        path (defaults to a sample PDF in the author's Downloads dir).
    """
    cli = argparse.ArgumentParser()
    default_pdf = ('/Users/Dhruv/Downloads/'
                   'Sample roll call vote PDF_multiple columns[2].pdf')
    cli.add_argument('-f', '--file', help='input pdf', default=default_pdf)
    return cli.parse_args()
예제 #3
0
def parse_args():
    """Read the input PDF location and optional output CSV location.

    Returns:
        (input_file, output_file_location) tuple. An element is None
        when the corresponding argument was absent or empty.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("-i", "--input_file",
                     help="complete location of the input pdf file",
                     required=True)
    cli.add_argument("-d", "--destination_file",
                     help="complete location where output csv file will be created",
                     required=False)
    opts = cli.parse_args()
    # Truthiness checks mirror the original: empty strings map to None.
    input_file = opts.input_file if opts.input_file else None
    output_file_location = opts.destination_file if opts.destination_file else None

    return input_file, output_file_location
예제 #4
0
def parse_arguments():
    """Parse known CLI arguments for text extraction.

    Unknown arguments are silently ignored (parse_known_args).

    Returns:
        argparse.Namespace with ``infile`` (source PDF path) and
        ``outdir`` (output directory, or None).
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--infile", "-i", type=str,
                     default='/raid/antoloui/Master-thesis/Data/QA/Cisco_CCNA.pdf',
                     help="The input file from which to extract text.")
    cli.add_argument("--outdir", "-o", type=str, default=None,
                     help="The output directory.")
    known, _ = cli.parse_known_args()
    return known
예제 #5
0
class SkillFinder(Resource):
    """REST resource that counts exact occurrences of a skill word in example.pdf."""

    # Request-body parser: 'skill' is a mandatory string field.
    parser = reqparse.RequestParser()
    parser.add_argument('skill',
                        type=str,
                        required=True,
                        help="This field cannot be blank")

    def post(self):
        print("reaching this stage")
        skill = SkillFinder.parser.parse_args()['skill']
        # NOTE(review): 'parser' below resolves to the module-level (tika)
        # parser, not the class attribute — class scope is not visible
        # inside methods. Confirm the module imports tika's parser.
        extracted = parser.from_file('example.pdf')
        # Exact word-match count, same as the original accumulator loop.
        occurrences = extracted['content'].split().count(skill)

        print(occurrences)
예제 #6
0
def _build_parser():
    """Construct the CLI parser: --file (required), --language, --outpath.

    Returns:
        ArgumentParser with defaults taken from the module-level
        DEFAULT_LANG and DEFAULT_OUTFILE constants.
    """
    cli = ArgumentParser()
    cli.add_argument('--file', type=str, dest='filepath',
                     help='File to parse', required=True)
    cli.add_argument('--language', type=str, dest='language',
                     help='Language of file. Default: English',
                     required=False, default=DEFAULT_LANG)
    cli.add_argument('--outpath', type=str, dest='outpath',
                     help='Name of output file',
                     required=False, default=DEFAULT_OUTFILE)
    return cli
예제 #7
0
        bulk(client, docs)


def main(args):
    """Generate documents from SQL results and index them into the search backend.

    Args:
        args: parsed CLI namespace carrying the database credentials,
            driver, SQL script path, and index settings consumed by Docs.
    """
    doc_builder = Docs(args.databasename, args.server, args.username,
                       args.password, args.driver, args.doc_json,
                       args.index_mapping, args.index_name)
    doc_builder.gen_docs(args.sql_script)
    doc_builder.indexing_files()


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Creating elasticsearch documents and indexing them')
    parser.add_argument('--databasename',
                        default="Buddie-Search-Testing",
                        help='databasename')
    parser.add_argument('--server',
                        default='ontwikkel-db.database.windows.net',
                        help='azure sql server server')
    parser.add_argument('--username',
                        default="username",
                        help='azure sql server username')
    parser.add_argument('--password',
                        default='....',
                        help='azure sql server password')
    parser.add_argument('--driver',
                        default='{ODBC Driver 17 for SQL Server}',
                        help='driver for azure sql server')
    parser.add_argument('--doc_json',
                        default='docs.jsonl',
예제 #8
0
    raw = parser.from_file(path)

    #return raw["metadata"]
    return raw["content"]


def read_pdf(path, engine="pdfminer"):
    # type: (str, str) -> str
    """Extract text from a PDF using the selected backend.

    Args:
        path: path to the PDF file.
        engine: backend name, "pdfminer" or "tika".

    Returns:
        The extracted text.

    Raises:
        ValueError: if *engine* names an unknown backend.
    """
    readers = {
        "pdfminer": _read_pdf_pdfminer,
        "tika": _read_pdf_tika,
    }
    if engine not in readers:
        raise ValueError("Engine {} doesn't exist".format(engine))

    return readers[engine](path)


if __name__ == "__main__":
    from argparse import ArgumentParser

    # CLI entry point: merge every PDF found in a directory into one file.
    cli = ArgumentParser(
        description="Merge pdf files in directory into one file.")
    cli.add_argument("dir", help="input directory")
    cli.add_argument("out", help="output file path")
    opts = cli.parse_args()

    join_pdfs_in_folder(opts.dir, opts.out)
def main(url):
    """Fetch incident data from *url*, load it into a database, print a status summary."""
    # Download the raw incident file.
    incident_file = project0.fetchincidents(url)

    # Pull the individual incident records out of the download.
    incidents = project0.extractincidents(incident_file)

    # Create the database. NOTE(review): the handle is never passed on —
    # populatedb appears to rely on module state; confirm in project0.
    db = project0.createdb()

    # Insert the extracted records.
    project0.populatedb(incidents)

    # Report what was stored.
    project0.status()


if __name__ == '__main__':
    # Accept the arrest-summary URL on the command line and hand it to main().
    cli = argparse.ArgumentParser()
    cli.add_argument("--incidents", type=str, required=True,
                     help="The arrest summary url.")
    opts = cli.parse_args()
    # Run only when a non-empty URL was supplied.
    if opts.incidents:
        main(opts.incidents)


예제 #10
0
파일: scraper.py 프로젝트: blues-lab/polipy
def _parse_args():
    parser = argparse.ArgumentParser(description='Download privacy policies, optionally update the DB')
    parser.add_argument('input_path', help='Path to file where policy urls are located.')
    parser.add_argument('output_dir', help='Path to directory where policies will be saved. Creates directory structure <outputdir>/<date>/<regiontag>/<domain>/<urlhash>/')
    parser.add_argument('--processes', '-p', default=multiprocessing.cpu_count(), type=int, help='Number of processes to use')
    parser.add_argument('--check_previous', '-c', default=False, action='store_true', help='Boolean indicating whether to check against previous policies')
    parser.add_argument('--language', '-l', default='en-US, en', help='Language string to set in Firefox\'s intl.accept_languages option. Defaults to "en_US, en"')
    parser.add_argument('--verbose', '-v', action='store_true', help='Enable verbose logging')
    return parser.parse_args()
예제 #11
0
            f = codecs.open(os.path.join(outPath, file[:-4] + '.txt'),
                            'w',
                            encoding='utf8',
                            errors='ignore')
            f.write(parsed['content'])
            f.close()


if __name__ == '__main__':
    # Entry point: build CLI arguments, falling back to defaults on failure.
    try:
        import argparse
        parser = argparse.ArgumentParser()
        # NOTE(review): type=bool is an argparse pitfall — bool() is applied
        # to the raw string, so any non-empty value (including "False")
        # parses as True; only the default behaves as documented.
        parser.add_argument(
            '-config',
            help='If the parameters is extracted from config file. '
            'If True, then the command line parameters will be bypassed. '
            'If False, then user needs to pass parameters from command line.',
            type=bool,
            default=True)
        parser.add_argument(
            '-path', help='The path to the original document files folder.')
        parser.add_argument(
            '-o',
            '--output',
            help='The path to the extracted text output folder')
        args = parser.parse_args()
    # The bare except also traps the SystemExit that argparse raises on
    # bad arguments, so the script degrades to args=None instead of exiting.
    except:
        args = None
        print('No arguments! using default path:')

    main(args)
예제 #12
0
def main():
    """CLI entry point: extract and store text/metadata from files in a directory."""
    import argparse

    cli = argparse.ArgumentParser(
        description="""Extrae y guarda texto y metadata de archivos.""")

    cli.add_argument("dirin", help="Directorio de archivos originales.")
    cli.add_argument("dirout",
                     help="Directorio para almacenar texto extraido.")
    cli.add_argument(
        "--recursivo",
        default=False,
        action="store_true",
        help="Visitar subdirectorios si se incluye. (%(default)s) Ej: --recursivo",
    )
    cli.add_argument(
        "--exts",
        action="append",
        required=False,
        help="Extraer solo de este tipo de archivo. Ej: --exts pdf --exts docx",
    )
    cli.add_argument(
        "--basura",
        action="append",
        help="Eliminar estos caracteres. Ej: --basura '<>!#' --basura � ",
    )
    cli.add_argument(
        "--chars",
        default=0,
        type=int,
        help="Eliminar texto con pocos caracteres. (%(default)s). Ej: --chars 10",
    )

    opts = cli.parse_args()

    # Resolve the output directory to an absolute path before delegating.
    out_dir = Path(opts.dirout).resolve()
    n_saved = extraer_todos(opts.dirin,
                            out_dir,
                            recursivo=opts.recursivo,
                            exts=opts.exts,
                            basura=opts.basura,
                            chars=opts.chars)
    print(f"{n_saved} nuevos archivos guardados en carpeta {str(out_dir)}")
예제 #13
0
    paragraphs = re.split('\n{2,}', text)
    paragraphs = [
        re.sub('[\n]', '', x) for x in paragraphs if re.search('[ก-์]{10}', x)
    ]
    sentences = [x for p in paragraphs for x in pythainlp.sent_tokenize(p)]
    output_file = os.path.splitext(f)[0] + '.sent'
    with open(output_file, mode='w') as out:
        for s in sentences:
            #out.write(' '.join(pythainlp.tokenize.word_tokenize(s)))
            out.write(s)
            out.write('\n')


if __name__ == "__main__":
    # Collect the English/Thai PDF directory prefixes from the command line.
    cli = argparse.ArgumentParser()
    cli.add_argument('--en_dir', type=str)
    cli.add_argument('--th_dir', type=str)
    opts = cli.parse_args()

    # Glob treats the directory value as a prefix, as in the original.
    en_pdfs = glob.glob(f'{opts.en_dir}*.pdf')
    th_pdfs = glob.glob(f'{opts.th_dir}*.pdf')
    print(
        f'There are {len(en_pdfs)} en documents and {len(th_pdfs)} th documents'
    )

    # Convert each PDF to text: Thai documents first, then English.
    for th_pdf in tqdm.tqdm(th_pdfs):
        pdf2text_th(th_pdf)
    for en_pdf in tqdm.tqdm(en_pdfs):
        pdf2text_en(en_pdf)
예제 #14
0
def initArgparse() -> ArgumentParser:
    """Build the tikatree CLI parser (directory-tree metadata tool).

    Returns:
        Configured ArgumentParser; -v prints the version, DIRECTORY is
        one or more paths, and the remaining flags toggle outputs.
    """
    cli = ArgumentParser(
        description="A directory tree metadata parser using Apache Tika, by default it runs arguments: -d, -f, -m, -s",
    )
    cli.add_argument("-v", "--version", action="version",
                     version=f"{cli.prog} version {VERSION}")
    cli.add_argument("DIRECTORY", type=Path, default=".", nargs="+",
                     help="directory(s) to parse")
    cli.add_argument("-d", "--directorytree", action="store_true",
                     help="create directory tree")
    cli.add_argument("-e", "--exclude", nargs="+",
                     help="directory(s) to exclude, includes subdirectories")
    cli.add_argument("-f", "--filetree", action="store_true",
                     help="creates a json and csv file tree")
    cli.add_argument("-m", "--metadata", action="store_true",
                     help="parse metadata")
    cli.add_argument("-nm", "--newmetadata", action="store_true",
                     help="create individual metadata files in a 'tikatree' directory")
    cli.add_argument("-s", "--sfv", action="store_true",
                     help="create sfv file")
    cli.add_argument("-y", "--yes", action="store_true",
                     help="automatically overwrite older files")
    return cli
예제 #15
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import re
import tika
from pyltp import Segmentor
from tika import parser
import argparse
import Levenshtein

# Command-line interface for the outpatient record processor.
# NOTE(review): this rebinds the module-level name `parser`, shadowing the
# `parser` imported from tika above — any later tika use must go through
# `tika.parser`; verify against the rest of the file.
parser = argparse.ArgumentParser(
    description="Process the patient records in outpaitent service")
# Directory containing the patient record data (required).
parser.add_argument("-d",
                    "--data",
                    type=str,
                    required=True,
                    help="Specify the data directory")
# Keys/features to extract from the records (required).
parser.add_argument("-f",
                    "--feature",
                    type=str,
                    required=True,
                    help="Specify the important keys needed extracted")
# Directory where results are written (required).
parser.add_argument("-o",
                    "--output",
                    type=str,
                    required=True,
                    help="Specify the results directory")
parser.add_argument("-dict",
                    "--dictionary",
                    type=str,
                    required=True,
예제 #16
0
def generate_text(markov_chain, words):
    """Generate *words* words of text by walking the Markov chain.

    Starts from a random state; whenever the chain has no successor it
    restarts from a fresh random state, appending only the final word
    of each subsequent state.

    Returns:
        The generated words joined by single spaces.
    """
    current = get_random_state(markov_chain)
    generated = current.split()[:words]
    while len(generated) < words:
        current = get_next_state(markov_chain, current)
        if current is None:
            # Dead end: re-seed the walk from a random state.
            current = get_random_state(markov_chain)
        generated.append(current.split()[-1])
    return ' '.join(generated)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Markov Chain Text Generator')
    parser.add_argument('-f',
                        '--file',
                        required=True,
                        help='Name of file to read text from.')
    parser.add_argument('-o',
                        '--order',
                        default=1,
                        type=int,
                        help='Number of past states each state depends on.')
    parser.add_argument('-w',
                        '--words',
                        default=100,
                        type=int,
                        help='Number of words to generate.')
    pargs = parser.parse_args()

    tokens = tokenise_text_file(pargs.file)
    markov_chain = create_markov_chain(tokens, order=pargs.order)
예제 #17
0
        regexp = r'(https?://(?:dx\.)?doi\.org/[0-9]{2}\.[0-9]{4,6}/\S*)'
    elif args.what == 'url' or args.what == 'urls':
        regexp = r'(https?://\S*)'
    else:
        raise ValueError(
            'Unrecognised value of the `what` argument: {}'.format(args.what))

    matches = re.findall(regexp, raw_text['content'])
    # return the harvest, one entry per line
    matches = list(set(matches))
    print('\n'.join(matches))


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Manage bib files.')
    parser.add_argument('--bibfile', type=str, default='')
    subparsers = parser.add_subparsers()
    # ---- parser for add reference command
    parser_add = subparsers.add_parser('add',
                                       help='Add reference to bibliography.')
    parser_add.add_argument('what', type=str)
    parser_add.add_argument('ids', nargs='*')
    parser_add.set_defaults(action=_add_reference)
    # ---- parser for print command
    parser_print = subparsers.add_parser(
        'print', help='Print to terminal the bibtex entry.')
    parser_print.add_argument('what', nargs='*')
    parser_print.add_argument('--where', type=str,
                              default='all')  # can be: doi, arxiv, all.
    parser_print.set_defaults(action=_print_reference)
    # parser_print.add_argument('--action', type=str, default='print')