Example #1
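The listing below is a fragment: the command-line arguments (args.corpus, args.diroutput, args.fileoutput, args.path, args.sizengram), the accumulator res, the glob_expression and the helpers build_original_filename, tbs and tf are defined elsewhere in the script. Here is a minimal setup sketch, assuming argparse and an accumulator shaped the way the fragment reads and writes it; the flag names, defaults and glob pattern are guesses, not the original script's.

# -- assumed setup, not part of the original fragment --
import argparse
import glob
import json
import os

from bs4 import BeautifulSoup  # assumption: bs4; the original may use the older BeautifulSoup module

import tbs  # project-specific block cutter (cut_bloc, cut_bloc2bs_elt), not shown
import tf   # project-specific n-gram helpers (ngram_extractor), not shown

parser = argparse.ArgumentParser()
parser.add_argument('--corpus', required=True)            # directory holding the source JSON documents
parser.add_argument('--diroutput', required=True)         # directory that will receive the output JSON
parser.add_argument('--fileoutput', default='')           # output file name; derived from --path when empty
parser.add_argument('--path', default='')                 # used to build the default output file name
parser.add_argument('--sizengram', type=int, default=3)   # n-gram size passed to tf.ngram_extractor
args = parser.parse_args()

# guessed: select every JSON file of the corpus
glob_expression = os.path.join(args.corpus, '*.json')

# res must expose the keys the fragment updates: global counters and a per-URL map
res = {'global': {'nbMessages': 0, 'nbBlocks': 0, 'nbCars': 0}, 'url': {}}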
for p in glob.glob(glob_expression):
    # build_original_filename() maps the globbed path back to the document name inside the corpus
    original_filename = build_original_filename(p)
    path_doc = os.path.join(args.corpus, original_filename)
    with open(path_doc, 'r') as f:
        d = json.load(f)
    print path_doc

    cpt_doc = 0      # messages (documents) in this file
    cpt_cut = 0      # blocks produced by the cutter
    len_content = 0  # total number of characters
    for url, info in d.iteritems():
        dict_ngram_url = {}  # unused in this counting pass
        bs_content = BeautifulSoup(info['content'])
        cut = tbs.cut_bloc(bs_content.body)  # split the parsed body into blocks
        for c in cut:
            cut2bs = tbs.cut_bloc2bs_elt(c)
            for s in cut2bs.strings:
                len_content += len(s)
            cpt_cut += 1
        cpt_doc += 1

    res['global']['nbMessages'] += cpt_doc
    res['global']['nbBlocks'] += cpt_cut
    res['global']['nbCars'] += len_content

    dict_ngram_author = {
        'nbMessages': cpt_doc,
        'nbBlocks': cpt_cut,
        'nbCars': len_content
    }

##
# args.diroutput
##

# the guard around this message is not shown in the fragment; an existence check is assumed
if not os.path.isdir(args.diroutput):
    print 'OUTPUTDIR %s does not exist, create it or choose another directory' % (args.diroutput)
    exit(0)

##
# args.fileoutput
##

fileoutput = build_json_filename_output(args.path) if args.fileoutput == '' else args.fileoutput

output_json = os.path.join(args.diroutput, fileoutput)
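build_json_filename_output() is referenced above but its body is not part of the fragment. Purely as an illustration of what it might do (relying on the os import from the setup sketch), it could derive the output name from the basename of args.path:

def build_json_filename_output(path):
    # hypothetical helper: reuse the basename of the input path, swapping its extension for .json
    base = os.path.splitext(os.path.basename(path))[0]
    return base + '.json'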

dict_ngram_author = {}

for url, info in d.iteritems():
    dict_ngram_url = {}
    # parse title and content together so n-grams from the title are counted as well
    bs_content = BeautifulSoup(info['title'] + info['content'])
    cut = tbs.cut_bloc(bs_content.body)
    res['url'][url] = {'global': True, 'block': []}
    for c in cut:
        cut2bs = tbs.cut_bloc2bs_elt(c)
        dict_ngram_block = {}
        content = ' '.join([s.strip() for s in cut2bs.strings])
        # assumed to update the author-, url- and block-level dicts with n-gram counts
        tf.ngram_extractor(content, args.sizengram, dict_ngram_author, dict_ngram_url, dict_ngram_block)
        res['url'][url]['block'].append(dict_ngram_block)
    res['url'][url]['global'] = dict_ngram_url  # replace the True placeholder with the url-level counts
res['global'] = dict_ngram_author

with open(output_json, 'w') as f:
    json.dump(res, f)
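tf.ngram_extractor() itself is not shown. Judging only from the call above, it is assumed to split each block's text into n-grams of size args.sizengram and to increment counts in the three dictionaries it receives. A rough, hypothetical sketch of that behavior:

def ngram_extractor(content, size_ngram, dict_author, dict_url, dict_block):
    # hypothetical reimplementation: count word n-grams at the author, url and block levels
    tokens = content.split()
    for i in range(len(tokens) - size_ngram + 1):
        ngram = ' '.join(tokens[i:i + size_ngram])
        for d in (dict_author, dict_url, dict_block):
            d[ngram] = d.get(ngram, 0) + 1

Under those assumptions, the file written to output_json maps 'global' to the author-level counts and 'url' to one entry per URL, each holding its own 'global' counts plus a 'block' list with one count dictionary per block.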