def article_path(sep_dir):
    """
    Returns the filesystem path to the given article's index.html, choosing
    the corpus root according to the article's publication status.
    """
    if pending(sep_dir):
        corpus_root = config.get('corpus', 'edit_path')
        path = os.path.join(corpus_root, sep_dir, 'index.html')
    elif published(sep_dir):
        corpus_root = config.get('corpus', 'path')
        path = os.path.join(corpus_root, sep_dir, 'index.html')
    elif copy_edit(sep_dir):
        corpus_root = config.get('corpus', 'edit_path')
        path = os.path.join(corpus_root, sep_dir, 'index.html')
        logging.info('Processing unpublished article ' + sep_dir)
    else:
        path = ''

    return path

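# Hedged usage sketch (not part of the original module; the entry name is
# illustrative): article_path() returns '' when none of the status checks
# match, so callers should test the result before opening it.
#
#     path = article_path('kant')
#     if path:
#         with open(path) as f:
#             html = f.read()
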
def new_entries():
    """
    Returns a list of all entries which do not have a corresponding InPhO
    Entity.
    """
    # get list of all entries in database
    sep_dirs = Session.query(Entity.sep_dir).filter(Entity.sep_dir != '').all()
    sep_dirs = [row[0] for row in sep_dirs]

    # get list of all entries in the SEP database
    entries = os.path.join(config.get('corpus', 'db_path'), 'entries.txt')

    # build list of new entries
    new_sep_dirs = []
    with open(entries) as f:
        for line in f:
            sep_dir = line.split('::', 1)[0]
            try:
                if sep_dir not in sep_dirs and copy_edit(sep_dir):
                    # published entry not in database, add to list of entries
                    new_sep_dirs.append(sep_dir)
            except IOError:
                # skip IOErrors, as these indicate potential entries w/o logs
                continue

    # remove the sample entry
    try:
        new_sep_dirs.remove('sample')
    except ValueError:
        pass

    return new_sep_dirs

def apriori(input_filename='output.txt', output_filename='edges.txt'):
    """
    Runs the external apriori binary on input_filename, writing the resulting
    edges to output_filename, and returns the binary's exit status.
    """
    apriori_bin = config.get('corpus', 'apriori_bin')
    args = [apriori_bin, input_filename, output_filename,
            '0.00000000000000001', '0.00000000000000001']
    return subprocess.call(args)

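# Hedged usage sketch (filenames illustrative): subprocess.call returns the
# apriori binary's exit status, so a non-zero value indicates failure.
#
#     status = apriori('output.txt', 'edges.txt')
#     if status != 0:
#         logging.error('apriori exited with status %d', status)
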
def get_title(sep_dir):
    """ Returns the title for the given sep_dir """
    entries = os.path.join(config.get('corpus', 'db_path'), 'entries.txt')
    with open(entries) as f:
        for line in f:
            dir, title, rest = line.split('::', 2)
            if dir == sep_dir:
                return title.replace(r"\'", "'")

    raise KeyError("Invalid sep_dir")

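# Hedged usage sketch (entry name illustrative): get_title() raises KeyError
# for an unknown sep_dir, so lookups are typically guarded.
#
#     try:
#         title = get_title('kant')
#     except KeyError:
#         title = None
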
def fuzzymatch_new():
    """
    Writes the fuzzymatch data to the cache specified in the config file.
    """
    fuzzy_path = config.get('corpus', 'fuzzy_path')
    titles = get_titles()
    for entry in new_entries():
        print entry
        matches = fuzzymatch(titles[entry])
        with open(os.path.join(fuzzy_path, entry), 'wb') as f:
            writer = csv.writer(f)
            for match, prob in matches:
                writer.writerow([match.ID, match.label, prob])

def single_fuzz(entry):
    """
    Writes the fuzzymatch data for a single entry to the cache, unescaping
    HTML entities in the title where necessary.
    """
    fuzzy_path = config.get('corpus', 'fuzzy_path')
    titles = get_titles()
    print titles[entry]
    if '&#' in titles[entry]:
        print unescape(titles[entry])
        matches = fuzzymatch(unescape(titles[entry]).decode('utf8'))
    else:
        matches = fuzzymatch(titles[entry])

    with open(os.path.join(fuzzy_path, entry), 'wb') as f:
        writer = csv.writer(f)
        for match, prob in matches:
            writer.writerow([match.ID, unidecode(match.label), prob])

def get_related():
    """
    Returns a dictionary of { sep_dir : related } pairs.
    """
    entries = os.path.join(config.get('corpus', 'db_path'),
                           'related_entries.txt')
    related = {}
    with open(entries) as f:
        f.readline()    # discard the first line
        for line in f:
            sep_dir, rest = line.split('::', 1)
            related[sep_dir] = rest.split('|')

    return related

def get_titles():
    """
    Returns a dictionary of { sep_dir : title } pairs.
    """
    entries = os.path.join(config.get('corpus', 'db_path'), 'entries.txt')
    titles = {}
    with open(entries) as f:
        for line in f:
            sep_dir, title, rest = line.split('::', 2)
            title = title.replace(r"\'", "'")
            titles[sep_dir] = title

    return titles

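# The parsers above assume entries.txt holds one '::'-delimited record per
# line, beginning with the directory name and title. An illustrative line
# (not taken from the real database) might look like:
#
#     kant::Immanuel Kant::...::...
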
def pending(sep_dir, db_root=None):
    """
    Checks if the given article is in the list of pending publications.
    """
    if db_root is None:
        db_root = config.get('corpus', 'db_path')

    pubpending = os.path.join(db_root, 'pubpending.txt')
    with open(pubpending) as db:
        for line in db:
            if sep_dir == line.strip():
                return True

    return False

def get_categories():
    """
    Returns a dictionary of { sep_dir : category } pairs.
    """
    entries = os.path.join(config.get('corpus', 'db_path'), 'entries.txt')
    categories = {}
    with open(entries) as f:
        for line in f:
            sep_dir, rest = line.split('::', 1)
            category = line.split('::')[-3]
            category = category.replace(r"\'", "'")
            categories[sep_dir] = category

    return categories

def get_status_code(sep_dir, code, log_root=None):
    """
    Checks if the given article has the given status code.
    """
    if log_root is None:
        log_root = config.get('corpus', 'log_path')

    log_path = os.path.join(log_root, sep_dir)
    if os.path.exists(log_path):
        with open(log_path) as log:
            for line in log:
                # use the published flag
                if ('::' + code) in line:
                    return True

    return False

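# published() and copy_edit(), called by article_path() and new_entries()
# above, are presumably thin wrappers around get_status_code(). A minimal
# sketch, assuming hypothetical status codes 'pub' and 'copyedit' (verify
# against the actual SEP log format before relying on these):
#
#     def published(sep_dir, log_root=None):
#         return get_status_code(sep_dir, 'pub', log_root)
#
#     def copy_edit(sep_dir, log_root=None):
#         return get_status_code(sep_dir, 'copyedit', log_root)
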
def fuzzymatch_new():
    """
    Writes the fuzzymatch data to the cache specified in the config file.
    """
    fuzzy_path = config.get('corpus', 'fuzzy_path')
    titles = get_titles()
    for entry in new_entries():
        print entry
        if '&#' in titles[entry]:
            print unescape(titles[entry])
            matches = fuzzymatch(unescape(titles[entry]).decode('utf8'))
        else:
            matches = fuzzymatch(titles[entry])

        with open(os.path.join(fuzzy_path, entry), 'wb') as f:
            writer = csv.writer(f)
            for match, prob in matches:
                writer.writerow([match.ID, unidecode(match.label), prob])

def fuzzymatch(string1, string2):
    """
    Takes two strings and performs a fuzzymatch on them.
    Returns a (confidence, distance) tuple.
    """
    php = PHP("""set_include_path('%(lib_path)s');
                 require 'fuzzymatch.php';"""
              % {'lib_path': config.get('general', 'lib_path')})

    code = '$string1 = utf8_decode("' + string1.encode('utf8') + '");'
    code += '$string2 = utf8_decode("' + string2.encode('utf8') + '");'
    code += 'print fuzzy_match($string1, $string2, 2);'

    result = php.get_raw(code)
    confidence, distance = map(float, result.split(','))

    return (confidence, distance)

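# Hedged usage sketch (strings illustrative): fuzzymatch() shells out to the
# PHP fuzzy_match routine and returns a (confidence, distance) pair of floats.
#
#     confidence, distance = fuzzymatch(u'Immanuel Kant', u'Kant, Immanuel')
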
def new(self):
    """ Form for creating a new entry """
    if not h.auth.is_logged_in():
        response.status_int = 401
        return "Unauthorized"
    if not h.auth.is_admin():
        response.status_int = 403
        return "Forbidden"

    # initialize template variables
    c.message = ""
    c.label = request.params.get('label', None)
    c.sep_dir = request.params.get('sep_dir', None)
    c.linklist = []

    if c.sep_dir and not c.label:
        try:
            c.label = sep.get_title(c.sep_dir)
        except KeyError:
            c.message = "Invalid sep_dir: " + c.sep_dir
            c.sep_dir = ""

    if c.sep_dir:
        fuzzypath = config.get('corpus', 'fuzzy_path')
        fuzzypath = os.path.join(fuzzypath, c.sep_dir)
        if os.path.exists(fuzzypath):
            with open(fuzzypath) as f:
                matches = csv.reader(f)
                for row in matches:
                    c.linklist.append(row)
        else:
            c.message = "Fuzzy match for " + c.sep_dir + " not yet complete."

    c.linklist.sort(key=lambda x: x[2], reverse=True)

    return render('entity/new.html')

def backup(data_path, backup_path, rundate):
    backup_path = os.path.join(backup_path, rundate + '.tar.gz')
    tar = tarfile.open(backup_path, 'w:gz')
    tar.add(data_path, arcname=os.path.basename(data_path))
    tar.close()


# start date 1970-01-01 for the first pull
# currently incremented manually for subsequent pulls, but should read log file
if __name__ == '__main__':
    import sys

    log_path = config.get('general', 'log_path')
    philpapers_log = os.path.join(log_path, 'philpapers')

    backup_path = config.get('general', 'backup_path')
    philpapers_backup_path = os.path.join(backup_path, 'philpapers')
    if not os.path.exists(philpapers_backup_path):
        os.mkdir(philpapers_backup_path)

    if os.path.exists(philpapers_log):
        with open(philpapers_log, 'r') as f:
            lineList = f.readlines()
        start_date = lineList[-1][8:].strip()
    else:
        start_date = "1970-01-01"

    if sys.argv[-1] != 'philpapers.py':

def pull_from_date(start_date, offset=0, data_path=None, log_path=None,
                   backup_path=None):
    """
    Pulls and pickles all new or changed paper abstracts from start_date
    until there are no more and then records the date it ran.
    """
    # ensure data dir is proper
    if data_path is None:
        data_path = 'philpapers_data'
    if not os.path.isdir(data_path):
        os.makedirs(data_path)

    # ensure log dir is proper
    if log_path is None:
        log_path = 'philpapers_log'

    rundate = strftime("%Y-%m-%d")
    sys.stderr.write('Starting at ' + start_date + ' to ' + str(rundate) + '\n')

    if backup_path is not None:
        backup(data_path, backup_path, rundate)

    conn = httplib.HTTPConnection("philpapers.org")
    while True:
        request_url = ("/utils/export_contents.json?apiId=2956&apiKey=" +
                       config.get("philpapers", "apikey") +
                       "&updated=" + str(start_date) + "T12:00:00" +
                       "&offset=" + str(offset))
        sys.stderr.write('requesting offset: ' + str(offset) + " from " +
                         str(start_date) + " with offset " + str(offset) + '\n')
        conn.request("GET", request_url)
        result = conn.getresponse()
        data_pulled = result.read()

        try:
            # check for good data
            data = json.loads(data_pulled)
        except ValueError:
            # wait a bit and try again
            sleep(10)
            conn.request("GET", request_url)
            result = conn.getresponse()
            data_pulled = result.read()
            try:
                # check new data
                data = json.loads(data_pulled)
            except ValueError:
                # two failures
                sys.stderr.write('terminated with ValueError on second attempt at offset'
                                 + str(offset) + '\n')
                break

        for entry in data['entries']:
            entry_file = os.path.join(data_path, entry['id'] + '.json')
            with open(entry_file, 'w') as f:
                f.write(json.dumps(entry))

        if not data['more']:
            sys.stderr.write('terminated successfully, no more data available\n')
            break

        offset += 100
        sleep(10)

    with open(log_path, 'a+') as log:
        log.write('Pulled: ' + str(rundate) + '\n')

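# Hedged usage sketch (paths and date illustrative): a typical incremental
# pull starts from the last recorded run date and backs up the data
# directory before requesting new entries.
#
#     pull_from_date('2014-01-01',
#                    data_path='philpapers_data',
#                    log_path='philpapers_log',
#                    backup_path='philpapers_backup')
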
#!/usr/bin/python
if __name__ == "__main__":
    from inpho.model import *
    from inpho import config
    import sys
    import os.path

    # initialize a list for people
    people = []

    # open people database and add each person to the people list
    people_db = os.path.join(config.get("corpus", "db_path"), "people.txt")
    with open(people_db) as f:
        for line in f:
            line = line.split("::")
            people.append(line)

    usernames = Session.query(User.username).all()

    for p in people:
        # skip incomplete entries
        if len(p) < 5:
            print "skipping %s" % p
            continue

        # gather data for user creation
        firstname, lastname, username, password, email = p[:5]

        # prepend "sep." to the username
        username = "sep.%s" % username

        print "importing %s" % username

if __name__ == '__main__':
    from argparse import ArgumentParser
    import sys

    parser = ArgumentParser()
    parser.add_argument("-u", "--username",
                        help="HTRC username, default in inpho.config")
    parser.add_argument("-p", "--password",
                        help="HTRC password, default in inpho.config")
    parser.add_argument("-v", "--volumes", nargs='+', help="HTRC volume ids")
    parser.add_argument("--pages", nargs='+', help="HTRC page ids")
    parser.add_argument("--test", help="test with 2 volumes")
    parser.add_argument("file", nargs='?', help="input file of ids")
    parser.add_argument("-o", "--output", required=True, help="output directory")
    args = parser.parse_args()

    username = config.get("hathitrust", "username")
    password = config.get("hathitrust", "password")

    if args.test:
        volumeIDs = ["uc2.ark:/13960/t2q52tn56", "uc2.ark:/13960/t2q52xv16"]
        pageIDs = ["uc2.ark:/13960/t2q52tn56[1,2,3,4,5]",
                   "uc2.ark:/13960/t2q52xv16[33,12,3,4,55]"]
    elif args.file:
        with open(args.file) as IDfile:
            volumeIDs = [line.strip() for line in IDfile]
    else:
        volumeIDs = args.volumes

    if not os.path.isdir(args.output):
        os.makedirs(args.output)
