Exemplo n.º 1
0
    def dump(self, data, filename = None):
        """Serialize *data* as JSON.

        If *filename* is given, write the JSON to that file (via self.open)
        followed by a newline and return None; otherwise return the JSON
        string (or *data* unchanged if encoding failed and an error was
        recorded).
        """

        # Dump to file
        if filename:
            # BUG FIX: the original reassigned filename to a hard-coded
            # placeholder string here, ignoring the caller's argument.
            self.open(filename, 'write')
            if not self.error:
                try:
                    jdump(data, self.fd)
                    self.fd.write('\n')
                except Exception as e:
                    JSON.error.add(error['json']['save'].format(
                        e = e,
                        filename = self.name
                    ))
            data = None

        # Return dump
        else:
            try:
                data = jdumps(data)
            except Exception as e:
                # Consistency fix: report to JSON.error like the save branch
                # above (the original used YAML.error -- presumably a
                # copy/paste from the YAML counterpart; confirm).
                JSON.error.add(error['json']['dump'].format(e = e))

        return data
Exemplo n.º 2
0
def main(args):
    """Lemmatize any texts in args.texts_path that are not yet present in
    the lemmatized dictionary, then dump the combined dictionary as JSON
    to args.lemmatized_path."""

    # Verify that everything required for non-preprocessed articles was supplied
    check_args(args, 'preprocessed', preprocessed_required)

    lemmatized_dict = load_lemmatized(args.lemmatized_path, args.forced)

    all_files = os.listdir(args.texts_path)
    new_files = [file for file in all_files if file not in lemmatized_dict]
    print('Новых текстов: {}'.format(len(new_files)))

    if new_files:

        if args.preprocessed:  # the files are already preprocessed
            full_lemmatized_dict = collect_texts(lemmatized_dict,
                                                 args.texts_path, new_files)

        else:

            full_lemmatized_dict, not_lemmatized_texts = process_corpus(
                args.udpipe_path,
                lemmatized_dict,
                args.texts_path,
                new_files,
                keep_pos=args.keep_pos,
                keep_punct=args.keep_punct,
                keep_stops=args.keep_stops)

            if not_lemmatized_texts:
                print('Не удалось разобрать следующие файлы:\n{}'.format(
                    '\n'.join(not_lemmatized_texts)))

        # BUG FIX: the original handed an open() handle straight to jdump and
        # never closed it; a context manager guarantees flush + close.
        with open(args.lemmatized_path, 'w', encoding='utf-8') as outfile:
            jdump(full_lemmatized_dict, outfile)
def main(stop_after_init=False):
    # Merge several crawl-result JSON files (all CLI args except the last)
    # into one combined SERP dict and write it to the last CLI argument.
    # NOTE(review): stop_after_init is accepted but never used in this body;
    # presumably honored by a sibling variant of main -- confirm before removing.
    from sys import argv

    argc = len(argv)
    # Require at least len(CLI_ARGS) positional arguments; otherwise show usage and exit.
    if argc <= len(CLI_ARGS):
        print 'Usage: %s %s %s' % (argv[0], ' '.join(CLI_ARGS), ' '.join(["[%s]" % x for x in OPTIONAL_ARGS]))
        print 'Currently missing parameters arguments:', ' '.join(CLI_ARGS[len(argv)-1:])
        exit()

    # Every argument except the last names an input result file.
    files = []
    for x in xrange(1, argc-1):
        files.append(argv[x].strip())

    # The last argument is the destination path for the merged JSON.
    output_path         = argv[-1].strip()

    t_init = time()
    t0 = time()
    print "Loading result files..."
    serps_combined = {}
    # Fold each crawl-result file into the combined SERP dict.
    for crawl_result_file in files:
        print "Loading", crawl_result_file, "..."
        t1 = time()
        with univ_open(crawl_result_file, 'r') as f:
            merge_serps(serps_combined, jload(f))
        print "Done in", time()-t1
    print "All files done in", time()-t0

    print "Writing URLs to output file", output_path, "..."
    t0 = time()
    # NOTE(review): the handle returned by univ_open is never closed here,
    # unlike the with-block used for reading above -- consider the same pattern.
    jdump(serps_combined, univ_open(output_path, 'w+'))
    print "Done in", time()-t0

    print "Script executed in", time() - t_init, "seconds"
Exemplo n.º 4
0
def download_config(impdb: str, branch: str) -> None:
    """Download the config from Github into the temp directory.

    Re-fetches until the branch hash is stable, so every stored table
    config comes from a single consistent commit.
    """
    meta_response = Request(META_URL.format(branch, impdb)).get()
    meta = json.loads(meta_response.read())
    tables = meta["tables"]

    sha = get_git_branch_hash(branch)
    # In case we push a new config version to github when the user is downloading
    while True:
        configs = {"_meta": meta}
        for table in tables:
            table_response = Request(TABLE_URL.format(branch, impdb, table)).get()
            configs[table] = json.loads(table_response.read())
        sha_check = get_git_branch_hash(branch)
        if sha_check == sha:
            break
        sha = sha_check

    target = config_directory() / branch / impdb
    if target.exists():
        rmtree(target)
    target.mkdir(parents=True)

    for fname, val in configs.items():
        with (target / f"{fname}.json").open("w") as f:
            jdump(val, f)

    with (target / "_hash").open("w") as f:
        f.write(sha)
def download_config(impdb: str) -> None:
    """Download the config from Github into the temp directory.

    Re-fetches until the master hash is stable, so every stored table
    config comes from a single consistent commit.
    """
    url = META_URL.format(impdb)
    meta = requests.get(url).json()
    tables = meta["tables"]

    sha = get_git_master_hash()
    # In case we push a new config version to github when the user is downloading
    while True:
        configs = {"_meta": meta}
        for table in tables:
            url = TABLE_URL.format(impdb, table)
            config = requests.get(url).json()
            configs[table] = config
        sha_check = get_git_master_hash()

        if sha_check == sha:
            break

        sha = sha_check

    path = config_directory()

    if (path / impdb).exists():
        rmtree(path / impdb)

    (path / impdb).mkdir(parents=True)
    # BUG FIX: the loop variable was named `json`, shadowing the json module;
    # renamed to `val`, matching the branch-aware variant of this function.
    for fname, val in configs.items():
        with (path / impdb / f"{fname}.json").open("w") as f:
            jdump(val, f)

    with (path / impdb / "_hash").open("w") as f:
        f.write(sha)
Exemplo n.º 6
0
 def _data_dumper(self, data):
     """Serialize *data* to a JSON string, using the legacy json.write API
     on old Sugar systems and the standard json module otherwise."""
     if _OLD_SUGAR_SYSTEM:
         return json.write(data)
     buffer = StringIO()
     jdump(data, buffer)
     return buffer.getvalue()
Exemplo n.º 7
0
 def del_ban(self, ban_target):
     "Remove a user from the bans and update the file"
     if ban_target not in self.BANS:
         return False
     self.bans.pop(ban_target)
     with open(Path(self.BLACKLIST), 'w') as f:
         jdump(self.BANS, f)
     return True
Exemplo n.º 8
0
def dump(object, filename, quiet=0):
    """Save *object* to *filename*; the format is chosen by extension:
    '.json' -> JSON (indent=2, non-ASCII allowed), '.dat' -> ndump binary.
    Progress messages go to stderr unless *quiet* is truthy."""
    filetype = filename.split('.')[-1]
    if not quiet: print('Saving %s ...' % filename, end='', file=stderr)
    if filetype == 'json':
        # BUG FIX: close the handle (the original leaked the open() result).
        with open(filename, 'w') as fh:
            jdump(object, fh, indent=2, ensure_ascii=0)
        # NOTE(review): 'done' originally printed only on the 'dat' branch,
        # leaving the 'Saving ...' line unterminated for JSON; now both
        # branches terminate it.
        if not quiet: print('done', file=stderr)
    elif filetype == 'dat':
        with open(filename, 'wb') as fh:
            ndump(fh, object)
        if not quiet: print('done', file=stderr)
Exemplo n.º 9
0
def json_dump(data):
    '''Serialize *data* to a JSON string with whichever JSON API is available.'''
    if OLD_SUGAR_SYSTEM is True:
        return json.write(data)
    buffer = StringIO()
    jdump(data, buffer)
    return buffer.getvalue()
Exemplo n.º 10
0
def json_dump(data):
    """Serialize *data* as JSON, using the legacy json.write API on old
    Sugar systems and the standard json module otherwise."""
    if OLD_SUGAR_SYSTEM is True:
        return json.write(data)
    sink = StringIO()
    jdump(data, sink)
    return sink.getvalue()
Exemplo n.º 11
0
def json_dump(data):
    """Return *data* serialized to a JSON string (legacy json.write on old
    Sugar systems, standard json module elsewhere)."""
    if OLD_SUGAR_SYSTEM is True:
        return json.write(data)
    with StringIO() as sink:
        jdump(data, sink)
        return sink.getvalue()
Exemplo n.º 12
0
def json_dump(data):
    """Serialize *data* to JSON text via whichever backend is configured."""
    if USING_JSON_READWRITE is True:
        return json.write(data)
    out_buffer = StringIO()
    jdump(data, out_buffer)
    return out_buffer.getvalue()
Exemplo n.º 13
0
def export_ref_strains(myflu, outdir='/Users/yujiazhou/Documents/nextflu/H9_nextflu-master/augur/source-data/'):
	"""Write myflu's reference strains to <outdir><virus_type>_ref_strains.json.

	*outdir* defaults to the original hard-coded location for backward
	compatibility but can now point anywhere (must end with a separator).
	"""
	strains = []
	for r in myflu.ref_strains:
		tmp = myflu.sequence_lookup[myflu.node_lookup[r].strain]
		strains.append({'seq': str(tmp.seq), 'date': tmp.date, 'strain': tmp.strain, 'region': tmp.region, 'country':tmp.country})
	from json import dump as jdump
	# Close the output file deterministically via a context manager.
	with open(outdir + myflu.virus_type + '_ref_strains.json', 'w') as ofile:
		jdump(strains, ofile, indent=2)
Exemplo n.º 14
0
 def guardar_datos():
     """Persist all known users (as JSON) and all messages (pickled) under
     the secure_db directory."""
     DDChat.crear_directorios()
     for usuario in DDChat.lista_usuarios:
         ruta = './secure_db/usr/' + str(usuario.phone_number)
         with open(ruta, 'w', encoding='utf-8') as archivo:
             jdump(usuario.__dict__, archivo)
     for mensaje in DDChat.lista_mensajes:
         ruta = 'secure_db' + sep + 'usr' + sep + DDChat.nombre_archivo(mensaje.date)
         with open(ruta, 'wb') as archivo:
             pdump(mensaje, archivo)
Exemplo n.º 15
0
def export_ref_strains(myflu):
	"""Dump myflu's reference strains to source-data/<virus_type>_ref_strains.json."""
	from json import dump as jdump
	strains = []
	for name in myflu.ref_strains:
		rec = myflu.sequence_lookup[myflu.node_lookup[name].strain]
		strains.append({'seq': str(rec.seq), 'date': rec.date, 'strain': rec.strain, 'region': rec.region, 'country': rec.country})
	with open('source-data/'+ myflu.virus_type+'_ref_strains.json', 'w') as ofile:
		jdump(strains, ofile, indent=2)
Exemplo n.º 16
0
 def add_ban(self, ban_target):
     "Add a user to the bans and dump the dict (True=Added, False=Not)"
     if ban_target in self.BANS:
         return False
     self.BANS[ban_target] = True
     with open(Path(self.BLACKLIST), 'w') as f:
         jdump(self.BANS, f)
     return True
Exemplo n.º 17
0
 def save(self, jsonfile = None):
     """Build a dict representation of all blocks (except block id 0) and
     all questions; optionally write it to *jsonfile* as indented JSON.
     Returns the dict either way."""
     outdict = {
         'blocks': [b.dictRepr() for b in self.blocks.values() if b.getID() != 0],
         'questions': [q.dictRepr() for q in self.questions.values()],
     }
     if jsonfile:
         from json import dump as jdump
         with open(jsonfile, 'w') as outfile:
             jdump(outdict, outfile, indent=4)
     return outdict
Exemplo n.º 18
0
 def save(self, filename: Union[Path, str]) -> None:
     '''
     Saves the relation in a file.
     Will save using the json format
     '''
     with open(filename, 'w') as fp:
         from json import dump as jdump
         from typedload import dump as typed_dump
         jdump(typed_dump(self), fp)
Exemplo n.º 19
0
 def save(self, File: Path):
     """Save the Encrypt dictionary (self.UserDict) as JSON to *File*.

     Raises TypeError when *File* is not a pathlib.Path.
     """
     # Idiom fix: isinstance() instead of comparing type() objects; this also
     # correctly accepts Path subclasses (PosixPath/WindowsPath/user classes).
     if not isinstance(File, Path): raise TypeError("Non-Path object used")
     with open(File, 'w') as f:
         jdump(self.UserDict, f)
Exemplo n.º 20
0
def common_setup(self):
    """Build and JSON-encode the create/update argument fixtures."""
    create_args = generate_args(*self.clist, **self.cdict)
    self.create_args = create_args
    self.jcreate_args = jdump(create_args)
    mutated = random_args(create_args)
    try:
        mutated.update(self.udict)
    except:
        # Best-effort merge: not every test class defines udict.
        pass
    self.update_args = mutated
    self.jupdate_args = jdump(mutated)
Exemplo n.º 21
0
def vocaber(file_ids):
    """Count token frequencies across the processed gzip-JSON files named by
    *file_ids* and write the combined vocabulary to data/vocabulary/vocab.json."""
    counts = Counter()
    for fid in file_ids:
        source = Path("data") / "processed" / f"{fid}.json.gz"
        with gzip.open(source) as fp:
            doc = jload(fp)
        counts += Counter(tok for sent in doc.values() for tok in sent)
    with open("data/vocabulary/vocab.json", "w") as wp:
        jdump(counts, wp)
Exemplo n.º 22
0
    def dump_json(self, data):
        '''
        Write *data* as JSON (encoded with ComplexEncoder) to the path stored
        at data["Location"]["json"]; return True when the file exists afterwards.
        '''
        target = data["Location"]["json"]
        with open(target, 'w') as file:
            jdump(data, file, cls=ComplexEncoder)
            if path.exists(target):
                return True
        return False
Exemplo n.º 23
0
 def save(self, filePath=None):
     """Write self as indented JSON to *filePath* (or the previously used path).

     Returns True on success; on any failure logs a warning and returns False.
     """
     self._filePath = expanduser(filePath or self._filePath)
     try:
         with open(self._filePath, 'wt') as fh:
             jdump(self, fh, indent=2)
     except:
         # Deliberately broad: any I/O or serialization error only warns.
         warning("File %s can't be written" % self._filePath)
         return False
     self.changed = False
     return True
Exemplo n.º 24
0
def save_search_cache(search_cache, cache_name="search_cache.json"):
    """Overwrite the search-cache file with *search_cache* as indented JSON.

    search_cache: dict mapping compound names to SMILES structure strings.
    cache_name: destination path (default "search_cache.json").
    """
    with open(cache_name, "w") as cache_file:
        jdump(search_cache, cache_file, indent=4)
Exemplo n.º 25
0
def to_json(in_stream, out_stream):
    """Decode ubjson from *in_stream* and write it to *out_stream* as JSON.

    Returns 0 on success, 8 when decoding fails, 16 when encoding fails.
    """
    try:
        obj = ubjload(in_stream)
    except DecoderException as ex:
        __error('Failed to decode ubjson: %s' % ex)
        return 8
    try:
        jdump(obj, out_stream, sort_keys=True)
    except TypeError as ex:
        # BUG FIX: the message previously said 'sjon'; corrected to 'json'.
        __error('Failed to encode to json: %s' % ex)
        return 16
    return 0
Exemplo n.º 26
0
def to_json(in_stream, out_stream):
    """Convert a bjdata stream to compact JSON on *out_stream*.

    Returns 0 on success, 8 when decoding fails, 16 when encoding fails.
    """
    try:
        decoded = bjload(in_stream, intern_object_keys=True,object_pairs_hook=OrderedDict)
    except DecoderException as ex:
        __error('Failed to decode bjdata: %s' % ex)
        return 8
    try:
        jdump(decoded, out_stream, sort_keys=False, separators=(',', ':'))
    except TypeError as ex:
        __error('Failed to encode to json: %s' % ex)
        return 16
    return 0
Exemplo n.º 27
0
def to_json(in_stream, out_stream):
    """Decode ubjson from *in_stream* and emit sorted-key JSON on *out_stream*.

    Returns 0 on success, 8 when decoding fails, 16 when encoding fails.
    """
    try:
        obj = ubjload(in_stream)
    except DecoderException as ex:
        __error('Failed to decode ubjson: %s' % ex)
        return 8
    try:
        jdump(obj, out_stream, sort_keys=True)
    except TypeError as ex:
        # BUG FIX: the message previously said 'sjon'; corrected to 'json'.
        __error('Failed to encode to json: %s' % ex)
        return 16
    return 0
Exemplo n.º 28
0
 def _load_hero_data(self):
     """Populate self.heroes from the cached hero JSON file, fetching it
     from the OpenDota API (and writing the cache) on first use."""
     if self.herojson.is_file():
         with open(self.herojson, 'r') as f:
             self.heroes = jload(f)
         return
     r = get(f"{self.OPENDOTA_API}/heroes")
     if r.status_code != 200:
         raise IOError("Failed to prefetch Hero JSON data from OpenDota")
     with open(self.herojson, 'w') as f:
         jdump(r.json(), f)
     self.heroes = r.json()
     return
Exemplo n.º 29
0
Arquivo: tests.py Projeto: bowu8/anwen
def common_setup(self):
    """Generate create/update fixtures and their JSON encodings."""
    args = generate_args(*self.clist, **self.cdict)
    self.create_args = args
    self.jcreate_args = jdump(args)
    updated_args = random_args(args)
    try:
        updated_args.update(self.udict)
    except:
        # Not every suite defines udict; ignore and keep the random args.
        pass
    self.update_args = updated_args
    self.jupdate_args = jdump(updated_args)
Exemplo n.º 30
0
def to_json(in_stream, out_stream):
    """Decode ubjson (interning object keys) and emit compact sorted JSON.

    Returns 0 on success, 8 when decoding fails, 16 when encoding fails.
    """
    try:
        decoded = ubjload(in_stream, intern_object_keys=True)
    except DecoderException as ex:
        __error('Failed to decode ubjson: %s' % ex)
        return 8
    try:
        jdump(decoded, out_stream, sort_keys=True, separators=(',', ':'))
    except TypeError as ex:
        __error('Failed to encode to json: %s' % ex)
        return 16
    return 0
Exemplo n.º 31
0
def write_to_JSON(result, outfile):
    """Write parsed query result (a Python dict) to JSON file"""
    try:
        with open(outfile, 'w') as json_file:
            jdump(result, json_file, indent=2, sort_keys=False,
                  separators=(',', ':'), ensure_ascii=False)
    except IOError as err:
        raise comparisonException(
            'Writing JSON output to {of} failed with error {ec}.'.format(
                of=outfile, ec=err))
Exemplo n.º 32
0
def main(args):
    """Rebuild the filename<->index mapping for args.lang from the files in
    args.texts_path and dump it as JSON to args.mapping_path."""

    # Load the previous mapping, if one already exists
    mapping = load_mapping(args.mapping_path, args.forced)

    i2lang = 'i2{}'.format(args.lang)
    lang2i = '{}2i'.format(args.lang)
    files = os.listdir(args.texts_path)
    mapping[i2lang] = {i: file for i, file in enumerate(files)}
    mapping[lang2i] = {file: i for i, file in enumerate(files)}
    print('Новый маппинг:')
    print('\t'.join(
        ['{}: {} объекта'.format(k, len(v)) for k, v in mapping.items()]))
    # BUG FIX: close the output file instead of leaking the handle returned
    # by a bare open() call.
    with open(args.mapping_path, 'w', encoding='utf-8') as out:
        jdump(mapping, out)
Exemplo n.º 33
0
def get_me(args):
    """Print the authenticated user: the full record as sorted JSON when
    args.key is None, otherwise just the requested field (error if missing)."""
    init()
    status, reason, user = api.get_user()
    if status != http.ok:
        http_error(status, reason)
    out = sys.stdout
    if args.key is None:
        jdump(user, out, sort_keys=True, indent=2)
        out.write('\n')
        out.flush()
        return
    if args.key in user:
        out.write('{:}\n'.format(user[args.key]))
        out.flush()
        return
    error("no such field: {:}".format(args.key))
Exemplo n.º 34
0
def backup(update, context):
    """Telegram handler: dump all lists owned by the sender to a JSON file
    and send that file back to them as a document."""
    message, bot = update.message, context.bot
    userid = message.from_user['id']
    logger.info("Backing up lists for {}".format(userid))
    ownlists = dbFuncs.getOwnedLists(userid)
    if not ownlists:
        _ = getTranslation(userid)
        message.reply_text(_("notownedlists"))
        # BUG FIX: stop here -- the original fell through and sent an empty
        # backup file right after telling the user they own no lists.
        return
    backuplist = []
    for ownlist in ownlists:
        backuplist.append(list(Todolist(ownlist[0])))
    backup_path = '{0}/do2backup.json'.format(backupsDir)
    with open(backup_path, 'w+') as file:
        jdump(backuplist, file)
    with open(backup_path, 'rb') as file:
        bot.send_document(chat_id=message.from_user['id'], document=file)
Exemplo n.º 35
0
def common_update(self, url, method='PATCH'):
    """Issue an update request with the prepared JSON payload and assert
    the status code expected for the verb (PATCH -> 200, PUT -> 201)."""
    response = self.fetch(url, method=method, body=jdump(self.update_args))
    if method == 'PATCH':
        assert response.code == 200
    elif method == 'PUT':
        assert response.code == 201
    assert response.body
Exemplo n.º 36
0
def show_text_status(status, title=''):
    """Print *status* to stdout as one JSON document: title, current/last
    index, and per-block transfer progress sorted by block key."""
    from json import dump as jdump
    from sys import stdout

    transfer_list = [
        {'block': block, 'progress': progress}
        for block, progress in sorted(status.transfers.items())
    ]
    document = {
        'title': title,
        'current_index': status.current_index,
        'last_index': status.last_index,
        'transfers': transfer_list,
    }
    jdump(document, stdout)
    print()
Exemplo n.º 37
0
def writeyaml2json(_filename: str, _yamlfile: str, _indent: int = 2, _sort_keys: bool = True, _minify: bool = False) -> None:
    """Convert YAML to JSON and write the JSON file (overwrites existing JSON files).

    _indent controls the pretty-printed indentation; _minify wins over _indent
    and produces a single-line document.
    """
    ensurefileexists(_yamlfile)
    _tmpyaml: dict = yamlload(_yamlfile)  # nosec
    if _minify:
        _out: str = jdump(_tmpyaml, indent=0, separators=(r',', r':'), sort_keys=_sort_keys).replace('\n', r'')
    else:
        # BUG FIX: the indent was hard-coded to 2, ignoring the _indent
        # parameter (cf. the sibling dict2json/csv2json helpers).
        _out = jdump(_tmpyaml, indent=_indent, separators=(r', ', r': '), sort_keys=_sort_keys)
    with open(_filename, mode=r'wt', encoding=r'utf-8') as _file:
        _file.write(_out)
Exemplo n.º 38
0
def dump_data_as_text(d, format):
    """ Dumps simple types (dict, iterable, float, int, unicode)
    as: json or plain text (compromise between human readable and parsable form)
    Returns an iterator returning text
    """
    # NOTE: Python 2 code (dict.iteritems below).
    if format == "json":
        # Generators are not JSON-serializable; materialize first.
        if isinstance(d, GeneratorType):
            d = list(d)
        yield jdump(d)
    elif format == "html":
        yield "<html><body>"
        if isinstance(d, dict):
            # One bold key/value line per dict entry.
            for k, v in d.iteritems():
                yield '<b>%s</b>: %s<br/>\n'%(k, v)
        else:
            # assume iterable
            yield "<ul>"
            for elt in d:
                yield "<li>%r</li>\n"%elt
            yield "</ul>"
        yield "</body></html>"
    else: # assume "txt"
        if isinstance(d, dict):
            for k, v in d.iteritems():
                yield '%s: %s\n'%(k, v)
        else:
            # assume iterable
            for elt in d:
                # Rows (lists/tuples) become pipe-separated columns.
                if isinstance(elt, (list, tuple)):
                    yield " | ".join(str(e) for e in elt) + "\n"
                else:
                    yield "%s\n"%elt
Exemplo n.º 39
0
def dict2json(_dict: dict, _indent: int = 2, _sort_keys: bool = True, _minify: bool = False) -> str:
    """Convert a Python dictionary to a JSON string

    >>> dict2json({'0': ['Val1', 'Val2', 'Val3', 'Val4'], '1': ['1', '2', '3', '4'], '2': ['5', '6', '7', '8'], '3': ['9', '10', '11', '12'], '4': ['13', '14', '15', '16'], '5': ['17', '18', '19', '20'], '6': ['3.14', '6.28', '2.73', '1.57']}, _sort_keys=True, _minify=True)
    '{"0":["Val1","Val2","Val3","Val4"],"1":["1","2","3","4"],"2":["5","6","7","8"],"3":["9","10","11","12"],"4":["13","14","15","16"],"5":["17","18","19","20"],"6":["3.14","6.28","2.73","1.57"]}'
    """
    if _minify:
        return jdump(_dict, indent=0, separators=(r',', r':'), sort_keys=_sort_keys).replace('\n', r'')
    return jdump(_dict, indent=_indent, separators=(r', ', r': '), sort_keys=_sort_keys)
Exemplo n.º 40
0
Arquivo: tests.py Projeto: bowu8/anwen
def common_update(self, url, method='PATCH'):
    """Send the prepared update payload and verify the response code
    (PATCH -> 200, PUT -> 201) and a non-empty body."""
    payload = jdump(self.update_args)
    res = self.fetch(url, method=method, body=payload)
    expected = {'PATCH': 200, 'PUT': 201}
    if method in expected:
        assert res.code == expected[method]
    assert res.body
Exemplo n.º 41
0
def csv2json(_list: list, _indent: int = 2, _sort_keys: bool = True, _minify: bool = False) -> str:
    """Convert the specified CSV (as a list) to a JSON string

    >>> csv2json([['Val1', 'Val2', 'Val3', 'Val4'], ['1', '2', '3', '4'], ['5', '6', '7', '8'], ['9', '10', '11', '12'], ['13', '14', '15', '16'], ['17', '18', '19', '20'], ['3.14', '6.28', '2.73', '1.57']], _sort_keys=True, _minify=True)
    '{"0":["Val1","Val2","Val3","Val4"],"1":["1","2","3","4"],"2":["5","6","7","8"],"3":["9","10","11","12"],"4":["13","14","15","16"],"5":["17","18","19","20"],"6":["3.14","6.28","2.73","1.57"]}'
    """
    numbered: dict = dict(enumerate(_list))
    if _minify:
        return jdump(numbered, indent=0, separators=(r',', r':'), sort_keys=_sort_keys).replace('\n', r'')
    return jdump(numbered, indent=_indent, separators=(r', ', r': '), sort_keys=_sort_keys)
Exemplo n.º 42
0
def tokenize(file_id, force):
    """Split raw documents into tokens and store them in
        gzip json files with corresponding file names.
    """
    source_path = Path("data") / "raw" / f"{file_id}.txt"
    target_path = Path("data") / "processed" / f"{file_id}.json.gz"

    if target_path.exists() and not force:
        print(f"{target_path} is already there.")
        return

    with open(source_path) as fp:
        tokens = {}
        for line_id, line in enumerate(fp):
            tokens[line_id] = [word.lower() for word in line.strip().split()]

    with gzip.open(target_path, "wt", encoding="ascii") as wp:
        jdump(tokens, wp)
Exemplo n.º 43
0
# NOTE(review): mutable default argument (some_list=[]); harmless here since
# the list is never mutated, but some_list=None would be the safer idiom.
# Python 2 code (print statements below).
def write_list(some_list=[]):
    # write list to a file using YAML
        try:
            with open("var/yaml_file.yml", "w") as f:
	            f.write(ydump(
	                some_list,
	                default_flow_style=False,
	                explicit_start=True,
	                width=1,
	                indent=2))
        except IOError as e:
            print "Could not write to file: %s" % e

    # write list to a file using JSON
        try:
            with open("var/json_file.json", "w") as f:
                jdump(some_list, f)
        except IOError as e:
            print "Could not write to file: %s" % e
Exemplo n.º 44
0
def yaml2json(_yaml: str, _indent: int = 2, _sort_keys: bool = True, _minify: bool = False) -> str:
    r"""Convert a YAML string to a JSON string

    >>> yaml2json("'0':\n- Val1\n- Val2\n- Val3\n- Val4\n'1':\n- '1'\n- '2'\n- '3'\n- '4'\n'2':\n- '5'\n- '6'\n- '7'\n- '8'\n'3':\n- '9'\n- '10'\n- '11'\n- '12'\n'4':\n- '13'\n- '14'\n- '15'\n- '16'\n'5':\n- '17'\n- '18'\n- '19'\n- '20'\n'6':\n- '3.14'\n- '6.28'\n- '2.73'\n- '1.57'\n", _sort_keys=True, _minify=True)
    '{"0":["Val1","Val2","Val3","Val4"],"1":["1","2","3","4"],"2":["5","6","7","8"],"3":["9","10","11","12"],"4":["13","14","15","16"],"5":["17","18","19","20"],"6":["3.14","6.28","2.73","1.57"]}'
    """
    with StringIO(_yaml) as _buf:
        _tmpyaml: dict = yamlload(_buf)  # nosec
    if _minify:
        return jdump(_tmpyaml, indent=0, separators=(r',', r':'), sort_keys=_sort_keys).replace('\n', r'')
    # BUG FIX: the indent was hard-coded to 2, ignoring the _indent parameter
    # (cf. the sibling dict2json/csv2json helpers, which honor it).
    return jdump(_tmpyaml, indent=_indent, separators=(r', ', r': '), sort_keys=_sort_keys)
Exemplo n.º 45
0
    print("Configuring keyword service...")
    jesaja = Jesaja()
    jesaja.set_stoplist(STOPLIST_NAME, [stopword.strip()
                                        for stopword in GzipFile(STOPLIST_FILE)])
    jesaja.set_keyword_profile(PROFILE_NAME, PROFILE)
    jesaja.set_matview_profile(
        matview_id=MATVIEW_NAME, profile_name=PROFILE_NAME)

    # check whether we have already shards available for the given matview
    if not jesaja.has_corpus(matview_id=MATVIEW_NAME):
        print("Uploading reference corpus...")
        # we try to rotate the corpus shards until enough documents have been
        # uploaded
        while jesaja.rotate_shard(MATVIEW_NAME) == 0:
            print(" Adding corpus...")
            jesaja.add_documents(
                MATVIEW_NAME, [doc.get_xml_document() for doc in xml_corpus_documents.values()])

    print("Computing keywords...")
    result = jesaja.get_keywords(
        MATVIEW_NAME, [doc.get_xml_document() for doc in xml_corpus_documents.values()])

    for content_id, xml_document in xml_corpus_documents.items():
        xml_document.add_attribute(
            'keywords', '; '.join(result[unicode(content_id)]))

    with GzipFile("results.json.gz", "w") as f:
        jdump([doc.get_xml_document()
               for doc in xml_corpus_documents.values()], f, indent=True)
Exemplo n.º 46
0
def write2json(_filename: str, _dict: dict, _indent: int = 2, _sort_keys: bool = True) -> None:
    """Send data to a new JSON file or overwrite an existing JSON file.

    _indent controls the pretty-printed indentation (default 2).
    """
    with open(_filename, mode=r'wt', encoding=r'utf-8') as _file:
        # BUG FIX: indent was hard-coded to 2, silently ignoring the _indent
        # parameter.
        _file.write(jdump(_dict, indent=_indent, separators=(r', ', r': '), sort_keys=_sort_keys))
Exemplo n.º 47
0
def json_dump(data):
    """ Save data using available JSON tools. """
    with StringIO() as sink:
        jdump(data, sink)
        return sink.getvalue()
Exemplo n.º 48
0
def csv2json(_filepath: str) -> str:
    """Convert a specified CSV file to a json string"""
    # BUG FIX: the original passed a bare open() result into creader() and
    # leaked the handle; read within a context manager instead.
    with open(_filepath, mode='rt', encoding='utf-8') as _file:
        return jdump(list(creader(_file)))
def main(stop_after_init=False):
    # Load a SERP result file, replace each result URL with its web-crawl id
    # (dropping URLs with no id), and write the converted SERP dict as JSON.
    # NOTE(review): stop_after_init is accepted but never used in this body;
    # confirm against the sibling main() before removing.
    from sys import argv

    argc = len(argv)
    # Require the mandatory CLI arguments; otherwise show usage and exit.
    if argc <= len(CLI_ARGS):
        print 'Usage: %s %s %s' % (argv[0], ' '.join(CLI_ARGS), ' '.join(["[%s]" % x for x in OPTIONAL_ARGS]))
        print 'Currently missing parameters arguments:', ' '.join(CLI_ARGS[len(argv)-1:])
        exit()

    serp_result_file            = argv[1].strip()
    url_web_crawl_ids_mapping   = argv[2].strip()
    output_path                 = argv[-1].strip()

    t_init = time()

    print "Loading SERP..."
    with univ_open(serp_result_file, 'r') as f:
        serps = jload(f)
    print "Loaded"

    # Build url -> line-number mapping; URLs are normalized by lowercasing
    # and stripping encoded newlines ("%0a"), matching the SERP loop below.
    print "Loading urls-to-ids dict..."
    urls_to_ids = {}
    with univ_open(url_web_crawl_ids_mapping, 'r') as f:
        i = 0
        for line in f:
            line = line.strip().lower().replace("%0a", '')
            urls_to_ids[line] = i
            i += 1
    print "Loaded"

    print "Converting SERP..."
    t0 = time()
    not_converted = set()
    total_urls = set()
    converted_set = set()
    for query_serps in serps.values():
        for serp in query_serps:
            i = 0
            # While-loop with manual index because entries may be popped
            # in place while iterating.
            while i < len(serp['results']):
                pos, url = serp['results'][i]
                url = url.lower().replace('%0a', '')
                total_urls.add(url)
                try:
                    serp['results'][i] = (pos, urls_to_ids[url])
                    converted_set.add(url)
                except KeyError as err:
                    # Looks like this URL has not been seen during the web crawl, as it has no assigned ID
                    not_converted.add(url)
                    serp['results'].pop(i)
                    i -= 1
                i += 1
    print "Over", len(total_urls), "total different URLs from the SERP results,", len(not_converted), "could not be converted"
    # Debug aid: dump the converted set when suspiciously few URLs matched.
    if len(total_urls) - len(not_converted) < 600:
        print converted_set
    print "Done in", time()-t0

    print "Writing URLs to output file", output_path, "..."
    t0 = time()
    with univ_open(output_path, 'w+') as out:
        jdump(serps, out)
    print "Done in", time()-t0

    print "Script executed in", time() - t_init, "seconds"
Exemplo n.º 50
0
def common_setup(self):
    """Build the create-args fixture and its JSON encoding."""
    created = generate_args(*self.clist, **self.cdict)
    self.create_args = created
    self.jcreate_args = jdump(created)
Exemplo n.º 51
0
def write2minijson(_filename: str, _dict: dict, _sort_keys: bool = True) -> None:
    """Send minified JSON data to a new JSON file or overwrite an existing JSON file"""
    minified = jdump(_dict, indent=0, separators=(r',', r':'), sort_keys=_sort_keys).replace('\n', r'')
    with open(_filename, mode=r'wt', encoding=r'utf-8') as _file:
        _file.write(minified)
Exemplo n.º 52
0
 def _data_dumper(self, data):
     io = StringIO()
     jdump(data, io)
     return io.getvalue()
Exemplo n.º 53
0
def json_dump(data):
    '''Serialize *data* to a JSON string using the json module.'''
    with StringIO() as sink:
        jdump(data, sink)
        serialized = sink.getvalue()
    return serialized