def dump(files: iter, output_dir: str, unique: bool, update_all: bool):
    """
    Parse MEDLINE XML files into tabular flat-files for each DB table.

    In addition, a ``delete.txt`` file is generated, containing the PMIDs
    that should first be deleted from the DB before copying the dump.

    Output files that are still empty after all input has been parsed are
    removed again. Each input stream is closed once it has been processed,
    and all output streams are closed even if parsing fails.

    :param files: a list of XML files to parse (optionally, gzipped)
    :param output_dir: path to the output directory for the dump
    :param unique: if ``True`` only VersionId == "1" records are dumped
    :param update_all: if ``True`` the PMIDs of all parsed records are
                       added to the list of PMIDs for deletion
    """
    file_names = {
        Citation.__tablename__: "citations.tab",
        Abstract.__tablename__: "abstracts.tab",
        Section.__tablename__: "sections.tab",
        Descriptor.__tablename__: "descriptors.tab",
        Qualifier.__tablename__: "qualifiers.tab",
        Author.__tablename__: "authors.tab",
        Identifier.__tablename__: "identifiers.tab",
        Database.__tablename__: "databases.tab",
        PublicationType.__tablename__: "publication_types.tab",
        Chemical.__tablename__: "chemicals.tab",
        Keyword.__tablename__: "keywords.tab",
        'delete': "delete.txt",
    }
    out_stream = {}
    count = 0

    try:
        # Open the outputs one by one so that a failure half-way through
        # still lets the ``finally`` clause close the ones already opened.
        for table, file_name in file_names.items():
            out_stream[table] = open(join(output_dir, file_name), "wt")

        parser = MedlineXMLParser(unique)

        for f in files:
            logger.info('dumping %s', f)

            if f.lower().endswith('.gz'):
                in_stream = gunzip(f, 'rb')
            else:
                in_stream = open(f)

            try:
                count += _dump(in_stream, out_stream, parser, update_all)
            finally:
                # Closing an already closed stream is a no-op, so this is
                # safe regardless of what _dump does with the stream.
                in_stream.close()

        # ``stream.name`` is the full path handed to open() above; it must
        # not be joined with output_dir a second time.
        empty_paths = [
            stream.name for stream in out_stream.values()
            if stream.tell() == 0
        ]
    finally:
        for stream in out_stream.values():
            stream.close()

    # Only reached when parsing succeeded: drop the files nothing was
    # written to.
    for path in empty_paths:
        remove(path)

    logger.info("parsed %i records", count)
def _openFile(name): if name.lower().endswith('.gz'): # use wrapper to support pre-3.3 return gunzip(name, 'rb') else: return open(name)
def run_module():
    """Bring a Kodi addon into the state requested by the task.

    Reads the ``state``, ``name``, ``kodi_user``, ``kodi_home`` and
    ``kodi_release`` parameters and reports the outcome through
    ``module.exit_json`` / ``module.fail_json``:

    * ``absent``: the addon is removed via ``remove_addon``.
    * ``present`` / ``enabled`` / ``disabled``: if the addon directory exists
      and its enabled flag already matches, nothing is done; otherwise the
      repository index is downloaded and ``install_addon`` is called.

    In check mode nothing is downloaded, installed or removed; only the
    ``changed`` value that a real run would produce is reported.
    """
    # Nothing has been modified yet. Every successful exit path below sets
    # 'changed' explicitly, so this seed is only reported on failure.
    result = dict(
        changed=False,
    )

    # define available arguments/parameters a user can pass to the module
    module = AnsibleModule(
        argument_spec=dict(
            state=dict(
                type='str',
                default='enabled',
                required=False,
                choices=[
                    'present',
                    'absent',
                    'enabled',
                    'disabled'
                ]
            ),
            name=dict(
                type='str',
                required=True
            ),
            kodi_user=dict(
                type='str',
                required=False,
                default='kodi'
            ),
            kodi_home=dict(
                type='str',
                required=False,
                default=''
            ),
            kodi_release=dict(
                type='str',
                required=True
            ),
        ),
        supports_check_mode=True
    )
    params = module.params
    name = params['name']
    state = params['state']

    # We only work with releases that have the database version 27
    # See: https://kodi.wiki/view/Databases#Database_Versions
    if params['kodi_release'] not in SUPPORTED_RELEASES:
        module.fail_json(
            # One-element tuple: formatting must not depend on the
            # container type of SUPPORTED_RELEASES.
            msg='Unsupported kodi release. Supported: %s' % (SUPPORTED_RELEASES,),
            **result
        )

    # Seed the default kodi home if not supplied
    if params['kodi_home']:
        kodi_home = params['kodi_home']
    else:
        kodi_home = "%s/.kodi" % (getpwnam(params['kodi_user']).pw_dir)

    # Seed the default kodi repository
    kodi_repo = 'http://mirrors.kodi.tv/addons/%s/addons.xml.gz' % (params['kodi_release'])
    # Shortcut for the actual repository, computed removing 'addons.xml(.gz)'
    repo_base = '/'.join(kodi_repo.split('/')[0:-1])

    addon_dir = "%s/addons/%s" % (kodi_home, name)

    if state == 'absent':
        if module.check_mode:
            # We would only remove the addon if it's in the directory,
            # or if it has traces in the db
            result['changed'] = (
                exists(addon_dir) or
                is_in_db(name, kodi_home)
            )
        else:
            result['changed'] = remove_addon(kodi_home, name)
        module.exit_json(**result)

    # 'absent' has exited above; of the remaining choices only 'disabled'
    # asks for the addon to be switched off.
    enabled = state in ['present', 'enabled']

    # If the directory already exists and the enabled status is as desired,
    # then we have nothing to do
    if exists(addon_dir):
        if is_enabled(name, kodi_home) == enabled:
            result['changed'] = False
            module.exit_json(**result)

    # From here on a real run installs or toggles the addon. Check mode must
    # not touch the system, so report the pending change and stop.
    if module.check_mode:
        result['changed'] = True
        module.exit_json(**result)

    addons_xml = mktemp()
    # Download the repository definition, uncompressing it if necessary
    # This check is left from an attempt to support multiple repositories
    # which might be retried, no need to delete it.
    if kodi_repo.split('.')[-1] == 'gz':
        addons_xml_gz = mktemp()
        download(kodi_repo, addons_xml_gz.name)
        addons_xml.seek(0)
        addons_xml.truncate()
        archive = gunzip(addons_xml_gz.name)
        try:
            addons_xml.write(archive.read())
        finally:
            # Do not leak the handle on the downloaded archive.
            archive.close()
        addons_xml.flush()
    else:
        download(kodi_repo, addons_xml.name)

    # Load the actual xml
    addons = etree.parse(addons_xml.name)

    install_addon(repo_base, addons, name, params['kodi_user'], kodi_home,
                  params['kodi_release'], enabled)
    result['changed'] = True

    module.exit_json(**result)
#!/usr/bin/env python3
"""Count the terms that occur in the distinct queries of the AOL query logs."""
from collections import Counter
import csv
from gzip import open as gunzip
import json
import string
import re

# Character class matching any single punctuation or whitespace character.
# The characters are escaped because string.punctuation contains "\", "]",
# "^" and "-", which are special inside a class: left unescaped, the "\"
# merely escaped the following "]" and was itself never a separator.
separators = '[{}]'.format(re.escape(string.punctuation + string.whitespace))

# Collect the distinct query strings (second column) of the ten log files
# aol-01 ... aol-10.
queries = set()
for i in range(10):
    filename = 'data/aol/aol-{}.txt.gz'.format(str(i + 1).zfill(2))
    with gunzip(filename, 'rt') as file:
        reader = csv.reader(file, delimiter='\t')
        _header = next(reader)  # skip the header row
        queries.update(line[1] for line in reader)

# Term frequencies over the distinct queries; a term is counted once for
# every distinct query it occurs in (and once per occurrence within it).
data = Counter(item for query in queries for item in re.split(separators, query))
data = dict(data)

# filter out stopwords
with open('data/stopwords.txt') as f:
    for word in f:
        data.pop(word.strip(), None)
type=float, metavar='min. proportion alt alleles', required=True, help='minimum proportion of alternative alleles to allow') parser.add_argument( '-ly', type=str, metavar='lyrata_only?', required=False, default='false', help='do you want to include lyrata only (true) or not (false)?') args = parser.parse_args() if args.gz == 'true' and args.v[-3:] == '.gz': gzip.gunzip(args.v) lookup_table_file = open( args.v + args.o + "repolarized.lookupKey.minAlleles_" + str(args.mi) + ".txt", 'w') lookup_table_file = open( args.o + "repolarized.lookupKey.minInd_" + str(args.mi) + ".txt", 'w') if args.ly == 'true': args.mi = 2 # args.mi must = 2, since there are only two lyrata samples args.mp = 1.0 count = 0 type_counts = [0, 0, 0, 0] count_file = open(args.o + "counts.txt", 'w')