Example #1
    collection = args.COLLECTION
    csvfile = args.OUTFILE
    idfile = args.IDFILE

    ptp = PairTreePathFinder(collection)
    lexicon = Dictionary.BuildLexicon()
   
    if not idfile:
        # default identifier in collection
        idfile = os.path.join(collection, 'id')

    scores = {}
    counter = Counter()
    with open(csvfile, 'w', encoding='utf-8') as csvf:
        csvwriter = csv.writer(csvf)
        for i, htid in enumerate(file_id_iter(idfile)):
            try:
                path, post = ptp.get_path_to_htid(htid)
            except ValueError as ve:
                print(ve)
                continue
            path = os.path.join(path, post + ".txt")
            try:
                with open(path, encoding='utf-8') as f:
                    text = f.readlines()

                _, _, _, lowcount, lowmatch, _ = AccEval.GetScore(text, lexicon)
                if lowcount == 0:
                    continue  # avoid division by zero for texts with no counted words
                pct = round(lowmatch / lowcount * 100, 1)
                scores[htid] = pct
                counter[pct] += 1
            except OSError as oe:
                # report missing or unreadable text files and move on
                print(oe)
                continue
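
All three examples read identifiers with file_id_iter, which is not defined in any of them. A minimal sketch, assuming it simply yields one stripped id per line of the id file:

def file_id_iter(path):
    # Yield one identifier per line of the id file, skipping blank lines.
    with open(path, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                yield line
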
Example #2
        help="Output a JSON result file in addition the the default csv file.")
    parser.add_argument("--id-file", "-i", 
        metavar="ID_FILE", 
        dest="ID_FILE", 
        help="Analyze the ids contained in ID_FILE rather than the entire database.")

    args = parser.parse_args()

    if not os.path.exists(args.DATABASE):
        print "database {} does not exist".format(args.DATABASE)
        sys.exit()

    with MarcSQLite(args.DATABASE) as db:

        if args.ID_FILE:
            ids = file_id_iter(args.ID_FILE)
        else:
            ids = None

        if args.MAPPING == 'years':
            mapper = map_publication_years
        elif args.MAPPING == 'subjects':
            mapper = map_subjects
        else:
            # guard against an unexpected value if --mapping is not choice-restricted
            print("unknown mapping {}".format(args.MAPPING))
            sys.exit(1)

        map_onto_records(mapper, db, args.CSV_OUT, json_fname=args.JSON_OUT, 
            ids=ids)
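
The if/elif above implies the mapping argument is restricted to two values. A sketch of how that option might be declared, in the style of the other arguments (the flag spelling and default are assumptions, since the parser setup is cut off above):

    parser.add_argument("--mapping", "-m",
        metavar="MAPPING",
        dest="MAPPING",
        choices=["years", "subjects"],
        default="years",
        help="What to map onto the records: publication years or subjects.")
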

Example #3
                        action='store_true',  
                        help='Overwrite existing collated documents.')
    parser.add_argument('--no-divs', 
                        action='store_true', 
                        help='If specified, do not write page or header divisions to the collation.')
    parser.add_argument('--skip', 
                        type=int,
                        default=0,
                        help='Number of lines in the id file to skip; e.g. after an interrupted collate.')
    args = parser.parse_args()
    
    collection = args.COLLECTION
    rewrite_existing = args.rewrite_existing
    include_divs = not args.no_divs
    id_file = args.ID_FILE

    if not id_file:
        # default identifier in collection
        id_file = os.path.join(collection, 'id')
        
    ids = file_id_iter(id_file)
    print(bigcollate(ids, collection, rewrite_existing=rewrite_existing, 
        include_divs=include_divs, skip=args.skip))

# Default location for recording collate progress (the RESUME_FILE name is an assumption).
RESUME_FILE = os.path.join(os.path.expanduser("~"), ".collate_resume")
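
A sketch of how such a resume file might pair with the --skip option in Example #3: record how many id lines have been processed, then read that count back as the skip value on the next run. The helper names are assumptions, not part of the original scripts:

def save_progress(lines_done, resume_file=RESUME_FILE):
    # Record how many lines of the id file have been collated so far.
    with open(resume_file, 'w', encoding='utf-8') as f:
        f.write(str(lines_done))

def load_progress(resume_file=RESUME_FILE):
    # Return the saved line count, or 0 if there is nothing to resume.
    try:
        with open(resume_file, encoding='utf-8') as f:
            return int(f.read().strip())
    except (OSError, ValueError):
        return 0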