Example #1
def article_path(sep_dir):
    """
    Returns the filesystem path to the article's index.html based on
    its publication status, or '' if it has no recognized status.
    """
    if pending(sep_dir):
        corpus_root = config.get('corpus', 'edit_path')
        path = os.path.join(corpus_root, sep_dir, 'index.html')
    elif published(sep_dir):
        corpus_root = config.get('corpus', 'path')
        path = os.path.join(corpus_root, sep_dir, 'index.html')
    elif copy_edit(sep_dir):
        corpus_root = config.get('corpus', 'edit_path')
        path = os.path.join(corpus_root, sep_dir, 'index.html')
        logging.info('Processing unpublished article ' + sep_dir)
    else:
        path = ''

    return path
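For context, a minimal usage sketch (the sep_dir value is made up, and it assumes the config file and the status helpers from the later examples are available):

# Hypothetical usage: resolve the on-disk location of an article.
path = article_path('kant-moral')   # made-up sep_dir
if path:
    print 'article lives at', path
else:
    print 'article has no known publication status'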
Example #2
def new_entries():
    """
    Returns a list of all entries which do not have a corresponding InPhO Entity.
    """

    # get list of all sep_dirs already in the InPhO database
    sep_dirs = Session.query(Entity.sep_dir).filter(Entity.sep_dir != '').all()
    sep_dirs = set(row[0] for row in sep_dirs)    # set for fast membership tests

    # get list of all entries in the SEP database
    entries = os.path.join(config.get('corpus', 'db_path'), 'entries.txt')

    # build list of new entries
    new_sep_dirs = []
    with open(entries) as f:
        for line in f:
            sep_dir = line.split('::', 1)[0]
            try:
                if sep_dir not in sep_dirs and copy_edit(sep_dir):
                    # copyedited entry not yet in the database; add it to the list
                    new_sep_dirs.append(sep_dir)
            except IOError:
                # skip IOErrors, as these indicate potential entries w/o logs
                continue

    # remove the sample entry
    try:
        new_sep_dirs.remove('sample')
    except ValueError:
        pass

    return new_sep_dirs
Example #3
def apriori(input_filename='output.txt', output_filename='edges.txt'):
    apriori_bin = config.get('corpus', 'apriori_bin')
    args = [
        apriori_bin, input_filename, output_filename, '0.00000000000000001',
        '0.00000000000000001'
    ]
    return subprocess.call(args)
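The two repeated near-zero strings are passed straight through to the external binary; they appear to be minimum-threshold parameters chosen so that effectively no association is filtered out, though the binary's own CLI is not shown in these examples. A minimal invocation sketch, assuming 'apriori_bin' in the config points at a working executable:

# Hypothetical call using the default filenames above.
ret = apriori()
if ret != 0:
    print 'apriori exited with status', ret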
Example #4
def get_title(sep_dir):
    """
    Returns the title for the given sep_dir
    """
    entries = os.path.join(config.get('corpus', 'db_path'), 'entries.txt')
    
    with open(entries) as f:
        for line in f:
            entry_dir, title, rest = line.split('::', 2)
            if entry_dir == sep_dir:
                return title.replace(r"\'", "'")

    raise KeyError("Invalid sep_dir: " + sep_dir)
Example #5
def fuzzymatch_new():
    """
    Writes the fuzzymatch data to the cache specified in the config file.
    """
    fuzzy_path = config.get('corpus', 'fuzzy_path')
    titles = get_titles()
    for entry in new_entries():
        print entry
        matches = fuzzymatch(titles[entry])
        with open(os.path.join(fuzzy_path, entry), 'wb') as f:
            writer = csv.writer(f)
            for match, prob in matches:
                writer.writerow([match.ID, match.label, prob])
Example #6
def single_fuzz(entry):
    """
    Writes the fuzzymatch data for a single entry to the fuzzy_path
    cache, unescaping any HTML character references in its title first.
    """
    fuzzy_path = config.get('corpus', 'fuzzy_path')
    titles = get_titles()
    print titles[entry]
    if '&#' in titles[entry]:
        print unescape(titles[entry])
        matches = fuzzymatch(unescape(titles[entry]).decode('utf8'))
    else:
        matches = fuzzymatch(titles[entry])
    with open(os.path.join(fuzzy_path, entry), 'wb') as f:
        writer = csv.writer(f)
        for match, prob in matches:
            writer.writerow([match.ID, unidecode(match.label), prob])
Example #7
def get_related():
    """
    Returns a dictionary of { sep_dir : related } pairs.
    """
    entries = os.path.join(config.get('corpus', 'db_path'), 'related_entries.txt')
    
    related = {}
    with open(entries) as f:
        f.readline()    # skip the header line
        for line in f:
            sep_dir, rest = line.split('::', 1)
            related[sep_dir] = rest.strip().split('|')

    return related
Example #8
def get_titles():
    """
    Returns a dictionary of { sep_dir : title } pairs.
    """
    entries = os.path.join(config.get('corpus', 'db_path'), 'entries.txt')

    titles = {}
    with open(entries) as f:
        for line in f:
            sep_dir, title, rest = line.split('::', 2)
            title = title.replace(r"\'", "'")
            titles[sep_dir] = title

    return titles
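The parsing in get_title and get_titles (and in get_categories below) implies that each line of entries.txt is a ::-delimited record beginning with sep_dir and title, with the category sitting third from the end. A self-contained sketch with a made-up record shows the splits these functions rely on:

# Hypothetical entries.txt record; the real field layout is longer.
line = "plato::Plato\\'s Ethics::a::b::primary::c::d"
sep_dir, title, rest = line.split('::', 2)
print sep_dir                       # plato
print title.replace(r"\'", "'")     # Plato's Ethics
print line.split('::')[-3]          # primary (what get_categories keeps)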
Example #9
def pending(sep_dir, db_root=None):
    """
    Checks if the given article is in the list of 
    pending publications.
    """
    if db_root is None:
        db_root = config.get('corpus', 'db_path')

    pubpending = os.path.join(db_root, 'pubpending.txt')
    with open(pubpending) as db:
        for line in db:
            if sep_dir == line.strip():
                return True

    return False
Example #10
def get_categories():
    """
    Returns a dictionary of { sep_dir : category } pairs.
    """
    entries = os.path.join(config.get('corpus', 'db_path'), 'entries.txt')

    categories = {}
    with open(entries) as f:
        for line in f:
            sep_dir, rest = line.split('::', 1)
            # the category is the third field from the end of the record
            category = line.split('::')[-3]
            category = category.replace(r"\'", "'")
            categories[sep_dir] = category

    return categories
Example #11
def get_status_code(sep_dir, code, log_root=None):
    """
    Checks if the given article has the given status code.
    """
    if log_root is None:
        log_root = config.get('corpus', 'log_path')

    log_path = os.path.join(log_root, sep_dir)
    if os.path.exists(log_path):
        with open(log_path) as log:
            for line in log:
                # status entries in the log carry a '::<code>' marker
                if ('::' + code) in line:
                    return True

    return False
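The published and copy_edit predicates used throughout these examples are not shown; given how get_status_code scans the per-article log for a '::code' marker, they are plausibly thin wrappers like the following (the code strings here are guesses, not confirmed by the source):

# Hypothetical wrappers over get_status_code.
def published(sep_dir):
    return get_status_code(sep_dir, 'published')

def copy_edit(sep_dir):
    return get_status_code(sep_dir, 'copyedit')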
Example #12
def fuzzymatch_new():
    """
    Writes the fuzzymatch data to the cache specified in the config file.
    """
    fuzzy_path = config.get('corpus', 'fuzzy_path')
    titles = get_titles()
    for entry in new_entries():
        print entry
        if '&#' in titles[entry]:
            print unescape(titles[entry])
            matches = fuzzymatch(unescape(titles[entry]).decode('utf8'))
        else:
            matches = fuzzymatch(titles[entry])
        with open(os.path.join(fuzzy_path, entry), 'wb') as f:
            writer = csv.writer(f)
            for match, prob in matches:
                writer.writerow([match.ID, unidecode(match.label), prob])
Example #13
def fuzzymatch(string1, string2):
    """
    Takes two strings and performs a fuzzymatch on them. 
    Returns a (confidence, distance) tuple.
    """
    php = PHP("""set_include_path('%(lib_path)s'); 
                 require 'fuzzymatch.php';""" %
              {'lib_path': config.get('general', 'lib_path')})

    code = '$string1 = utf8_decode("' + string1.encode('utf8') + '");'
    code += '$string2 = utf8_decode("' + string2.encode('utf8') + '");'
    code += 'print fuzzy_match($string1, $string2, 2);'

    result = php.get_raw(code)
    confidence, distance = map(float, result.split(','))

    return (confidence, distance)
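A minimal usage sketch for the two-argument version above (it assumes the PHP bridge and the lib_path setting from the config are working; the titles are made up):

# Hypothetical comparison of two made-up titles.
confidence, distance = fuzzymatch(u'Kant', u'Immanuel Kant')
print 'confidence=%f, distance=%f' % (confidence, distance)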
Example #14
    def new(self):
        """ Form for creating a new entry """
        if not h.auth.is_logged_in():
            response.status_int = 401
            return "Unauthorized"
        if not h.auth.is_admin():
            response.status_int = 403
            return "Forbidden"

        # initialize template variables
        c.message = ""
        c.label = request.params.get('label', None)
        c.sep_dir = request.params.get('sep_dir', None)

        c.linklist = []
       
        if c.sep_dir and not c.label:
            try:
                c.label = sep.get_title(c.sep_dir)
            except KeyError:
                c.message = "Invalid sep_dir: " + c.sep_dir
                c.sep_dir = ""

        if c.sep_dir:
            fuzzypath = config.get('corpus', 'fuzzy_path')
            fuzzypath = os.path.join(fuzzypath, c.sep_dir)
            if os.path.exists(fuzzypath):
                with open(fuzzypath) as f:
                    matches = csv.reader(f)
                    for row in matches:
                        c.linklist.append(row)
            else:
                c.message = "Fuzzy match for " + c.sep_dir + " not yet complete."

            # rows come back from csv as strings, so sort on the numeric value
            c.linklist.sort(key=lambda x: float(x[2]), reverse=True)

        return render('entity/new.html')
Example #15
def backup(data_path, backup_path, rundate):
    backup_path = os.path.join(backup_path, rundate + '.tar.gz')
    tar = tarfile.open(backup_path, 'w:gz')
    tar.add(data_path, arcname=os.path.basename(data_path))
    tar.close()

# start date 1970-01-01 for the first pull
# currently incremented manually for subsequent pulls, but should read the log file
if __name__ == '__main__':
    import sys

    log_path = config.get('general', 'log_path')
    philpapers_log = os.path.join(log_path, 'philpapers')

    backup_path = config.get('general', 'backup_path')
    philpapers_backup_path = os.path.join(backup_path, 'philpapers')
    if not os.path.exists(philpapers_backup_path):
        os.mkdir(philpapers_backup_path)

    if os.path.exists(philpapers_log):
        with open(philpapers_log, 'r') as f:
            lineList = f.readlines()
            # drop the 8-character 'Pulled: ' prefix to recover the date
            start_date = lineList[-1][8:].strip()
    else:
        start_date = "1970-01-01"

    if sys.argv[-1] != 'philpapers.py':
Example #16
def pull_from_date(start_date, offset=0, data_path=None, log_path=None,
                   backup_path=None): 
    """
    Pulls and pickles all new or changed paper abstracts
    from start_date until there are no more and then records
    the date it ran.
    """
    # ensure data dir is proper
    if data_path is None:
        data_path = 'philpapers_data'

    if not os.path.isdir(data_path):
        os.makedirs(data_path)

    # ensure log dir is proper
    if log_path is None:
        log_path = 'philpapers_log'
    
    rundate = strftime("%Y-%m-%d")
    sys.stderr.write('Starting at ' + start_date + ' to ' + str(rundate) + '\n')

    if backup_path is not None:
        backup(data_path, backup_path, rundate)
    
    conn = httplib.HTTPConnection("philpapers.org")

    while True:
        request_url = ("/utils/export_contents.json?apiId=2956&apiKey=" +
                       config.get("philpapers", "apikey") +
                       "&updated=" + str(start_date) + "T12:00:00" +
                       "&offset=" + str(offset))

        sys.stderr.write('requesting offset ' + str(offset) +
                         ' from ' + str(start_date) + '\n')
        conn.request("GET", request_url)
        result = conn.getresponse()
      
        data_pulled = result.read()
        
        try: # check for good data
            data = json.loads(data_pulled)
        except ValueError: # wait a bit and try again
            sleep(10) 
            conn.request("GET", request_url)
            result = conn.getresponse()
            data_pulled = result.read()
            try: # check new data
                data = json.loads(data_pulled)
            except ValueError: # two failures
                sys.stderr.write('terminated with ValueError on second '
                                 'attempt at offset ' + str(offset) + '\n')
                break

        for entry in data['entries']:
            entry_file = os.path.join(data_path, entry['id'] + '.json')
            with open(entry_file, 'w') as f:
                f.write(json.dumps(entry))
                
        if not data['more']:
            sys.stderr.write('terminated successfully, no more data available\n')
            break

        offset += 100
        sleep(10)

    with open(log_path, 'a+') as log:
        log.write('Pulled: ' + str(rundate) + '\n')
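Note that the 'Pulled: ' prefix written here is exactly eight characters, which is what the slice lineList[-1][8:] in the __main__ block of Example #15 relies on to recover the date of the last run:

# Round trip between the log line written above and the resume logic.
line = 'Pulled: 2013-05-01\n'
assert len('Pulled: ') == 8
print line[8:].strip()   # 2013-05-01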
Example #17
#!/usr/bin/python
if __name__ == "__main__":
    from inpho.model import *
    from inpho import config
    import sys
    import os.path

    # initialize a list for people
    people = []
    
    # open people database and add each person to the people list
    people_db = os.path.join(config.get("corpus", "db_path"), "people.txt")
    with open(people_db) as f:
        for line in f:
            line = line.split("::")
            people.append(line)
    
    usernames = Session.query(User.username).all()
    for p in people:
        # skip incomplete entries
        if len(p) < 5:
            print "skipping %s" % p
            continue
    
        #gather data for user creation
        firstname, lastname, username, password, email = p[:5]
      
        # prepend "sep." to the username
        username = "******" % username
    
        print "importing %s" % username
Example #18
if __name__ == '__main__':
    from argparse import ArgumentParser
    import sys
    parser = ArgumentParser()
    parser.add_argument("-u", "--username", 
        help="HTRC username, default in inpho.config")
    parser.add_argument("-p", "--password", 
        help="HTRC password, default in inpho.config")
    parser.add_argument("-v", "--volumes", nargs='+', help="HTRC volume ids")
    parser.add_argument("--pages", nargs='+', help="HTRC page ids")
    parser.add_argument("--test", help="test with 2 volumes")
    parser.add_argument("file", nargs='?', help="input file of ids")
    parser.add_argument("-o", "--output", required=True, help="output directory")
    args = parser.parse_args()

    # command-line credentials, if given, override the inpho.config defaults
    username = args.username or config.get("hathitrust", "username")
    password = args.password or config.get("hathitrust", "password")
   
    if args.test: 
        volumeIDs  = ["uc2.ark:/13960/t2q52tn56",
                      "uc2.ark:/13960/t2q52xv16"]
        pageIDs    = ["uc2.ark:/13960/t2q52tn56[1,2,3,4,5]",
                      "uc2.ark:/13960/t2q52xv16[33,12,3,4,55]"]
    elif args.file:
        with open(args.file) as IDfile:
            volumeIDs = [line.strip() for line in IDfile]
    else:
        volumeIDs = args.volumes
    
    if not os.path.isdir(args.output):
        os.makedirs(args.output)