Example #1
def _get_subjects(f):
    items = ijson.items(f, 'cudSubjects.item')
    for subject in items:
        attributes = {a.remote: None for a in cud_attributes}
        attributes.update({a['name']: a['value'] for a in subject['attributes']})
        attributes[cud_id] = int(attributes[cud_id])
        yield attributes
Example #2
def do_import(args, db):
    pdb = db()
    friend_name = args.friend_name
    friend = pdb.get_friend_by_name(friend_name)
    if not friend:
        print >> sys.stderr, "No friend by that name, check your spelling or create a new friend using add_friend"
        return False
    friend_id = friend['id']

    print "Importing Authors"
    with open(args.file_name) as import_file:
        authors_to_insert = []
        author_docs = ijson.items(import_file, 'authors.item')
        for author_doc in author_docs:
            authors_to_insert.append(author_doc)
            if len(authors_to_insert) >= INSERT_BATCH_SIZE:
                print "."
                pdb.load_author_documents_from_friend(friend_id, authors_to_insert)
                authors_to_insert = []
        if authors_to_insert:
            pdb.load_author_documents_from_friend(friend_id, authors_to_insert)

    print "Importing Tomes"
    with open(args.file_name) as import_file:
        tomes_to_insert = []
        tome_docs = ijson.items(import_file, 'tomes.item')
        for tome_doc in tome_docs:
            tomes_to_insert.append(tome_doc)
            if len(tomes_to_insert) >= INSERT_BATCH_SIZE:
                print "."
                pdb.load_tome_documents_from_friend(friend_id, tomes_to_insert)
                tomes_to_insert = []

        if tomes_to_insert:
            pdb.load_tome_documents_from_friend(friend_id, tomes_to_insert)
Example #3
def parser(base, objconf, skip, **kwargs):
    """ Parses the pipe content

    Args:
        base (str): The base currency (exchanging from)
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        assign (str): Attribute to assign parsed content (default: exchangerate)
        stream (dict): The original item

    Returns:
        Tuple(dict, bool): Tuple of (item, skip)

    Examples:
        >>> from riko import get_path
        >>> from riko.lib.utils import Objectify
        >>>
        >>> url = get_path('quote.json')
        >>> conf = {'url': url, 'currency': 'USD', 'sleep': 0, 'precision': 6}
        >>> item = {'content': 'GBP'}
        >>> objconf = Objectify(conf)
        >>> kwargs = {'stream': item, 'assign': 'content'}
        >>> result, skip = parser(item['content'], objconf, False, **kwargs)
        >>> result
        Decimal('1.545801')
    """
    if skip:
        rate = kwargs['stream']
    elif objconf.url.startswith('http'):
        get = partial(requests.get, stream=True)
        sget = utils.memoize(utils.HALF_DAY)(get) if objconf.memoize else get
        r = sget(objconf.url, params=objconf.params)
        json = next(items(r.raw, ''))
    else:
        context = utils.SleepyDict(delay=objconf.sleep)
        url = utils.get_abspath(objconf.url)

        try:
            with closing(urlopen(url, context=context)) as f:
                json = next(items(f, ''))
        except TypeError:
            with closing(urlopen(url)) as f:
                json = next(items(f, ''))

    if not skip:
        places = Decimal(10) ** -objconf.precision
        rates = parse_response(json)
        rate = calc_rate(base, objconf.currency, rates, places=places)

    return rate, skip
Example #4
def do_import_self(args, db):
    with open(args.file_name) as import_file:
        author_docs = ijson.items(import_file, 'authors.item')
        for author_doc in author_docs:
            print "Author: ", author_doc
            db().load_own_author_document(author_doc)

    with open(args.file_name) as import_file:
        tome_docs = ijson.items(import_file, 'tomes.item')
        for tome_doc in tome_docs:
            print "Tome: ", tome_doc
            db().load_own_tome_document(tome_doc)
Example #5
 def handle(self, *args: Any, **options: Any) -> None:
     total_count = 0
     changed_count = 0
     with open(options['dump1'], 'r') as dump1, open(options['dump2'], 'r') as dump2:
         for m1, m2 in zip(ijson.items(dump1, 'item'), ijson.items(dump2, 'item')):
             total_count += 1
             if m1['id'] != m2['id']:
                 self.stderr.write('Inconsistent messages dump')
                 break
             if m1['content'] != m2['content']:
                 changed_count += 1
                 self.stdout.write('Changed message id: {id}'.format(id=m1['id']))
     self.stdout.write('Total messages: {count}'.format(count=total_count))
     self.stdout.write('Changed messages: {count}'.format(count=changed_count))
Example #6
    def annotate(self, string, bypass_exceptions=True):
        params = urlencode([("document", string.lower()), ("id", 1)])
        url_string = "{}/disambiguate?{}".format(self.url, params)
        sys.stderr.write(url_string + "\n")

        efforts = 0
        response = None
        while response is None and efforts < 30:
            efforts += 1
            try:
                response = urllib2.urlopen(url_string).read()
            except Exception as e:
                if "[Errno 104]" in str(e):
                    sys.stderr.write("Connection with WSD analyzer reset by peer. Trying again in a minute\n")
                    sleep(60)
                elif "HTTP Error 500" in str(e):
                    sys.stderr.write("WSD analyzer could not analyze this sentence and returned Error 500\n")
                    return " ".join(["{}|-".format(t) for t in string.split()])
                else:
                    sys.exit("Error while talking with WSD: {}".format(e))

        if response is None:
            sys.exit("WSD analyzer did not respond for more than 30 minutes")
        sys.stderr.write(response + "\n")
        responsefile = StringIO.StringIO()
        responsefile.write('{"documents": [' + response + ' ]}')
        responsefile.seek(0)
        items = list(ijson.items(responsefile, "documents.item"))
        sys.stderr.write("items: {}\n".format(items))
        factored_output, replaced_output  = read_wsd_output(items[0], string)
        responsefile.close()
        return factored_output
Example #7
def write_data(dict_words):
    items = []
    with open(POSTS_FILE_PATH, 'r') as f:
        for post in ijson.items(f, 'item'):
            labels = [0] * len(CATEGORIES)
            was = False
            for hub in post['hubs']:
                for cur_label, label in enumerate(CATEGORIES):
                    if hub in label:
                        labels[cur_label] = 1
                        was = True

            if not was:
                continue

            words = [0] * BagOfWords.NUM_VOCABULARY_SIZE
            post_words = post['content'] + post['title']
            for word in post_words:
                if word in dict_words:
                    words[dict_words[word]] = 1
                else:
                    words[BagOfWords.NUM_VOCABULARY_SIZE - 1] = 1

            labels = np.packbits(labels).tolist()
            words = np.packbits(words).tolist()
            items.append((labels, words))

    shuffle(items)
    train_set_size = int(len(items) * BagOfWords.TRAIN_RATIO)
    _write_set(TF_ONE_SHOT_TRAIN_FILE_PATH, items[:train_set_size])
    _write_set(TF_ONE_SHOT_EVAL_FILE_PATH, items[train_set_size:])

    print('Set size : ', len(items))
    print('Train set size : ', train_set_size)
    print('Eval set size : ', len(items) - train_set_size)
Example #8
def get_l1(ss_ontology):
	for item in ijson.items(open(ss_ontology), ''):
		l1 = []
		for x in item["data"]:
			if "level1" in x and x["level1"] not in l1:
				l1.append(x["level1"])
		return l1			
Example #9
def get_l2(ss_ontology):
	for item in ijson.items(open(ss_ontology), ''):
		l2 = []
		for x in item["data"]:
			if "level2" in x and x["level2"] not in l2:
				l2.append(x["level2"])
		return l2	
Example #10
    def parse(self, destination_folder, fname, from_date, to_date):
        self.parsing_started.emit()

        self.fname = fname
        self.destination_folder = destination_folder
        Path(destination_folder).makedirs_p()
        self.from_date = from_date
        self.to_date = to_date

        nb_instances = 0
        success = False
        error_message = ""
        try:
            with open(self.fname, encoding="UTF-8", mode='r') as f:
                items = ijson.items(f, 'item')
                nb_instances = len(list(filter(self.submission_filter, items)))
        except IOError:
            error_message = "Impossible de lire le fichier."
        except ValueError:
            error_message = "Le fichier n'est pas un fichier JSON valide."
        except Exception as e:
            error_message = repr(e)
        else:
            success = True
        finally:
            self.nb_instances = nb_instances
            self.parsing_ended.emit(success, nb_instances, error_message)
Example #11
    def iter_data(self, fil, **kwargs):
        '''iterates over file and returns rows with location, date and parameter value'''
        from acacia.data.models import MeetLocatie, Parameter
        
        location = kwargs.get('meetlocatie',None)
        if location and isinstance(location, MeetLocatie):
            location = location.name

        parameter = kwargs.get('parameter',self.parm)
        if parameter and isinstance(parameter, Parameter):
            parameter = parameter.name
            
        for p in ijson.items(fil,'features.item.properties'):
            
            loc = p['Meetpuntcode']
            if location and loc != location:
                continue 
            
            par = p['Parametercode']
            if parameter and par != parameter:
                continue
            
            for d in p['data']:
                try:
                    val = float(d['Waarde'])
                    dat = datetime.datetime.strptime(d['datum'],'%Y-%m-%d')
                    yield (loc,par,dat,val)
                except:
                    # problem with datapoint
                    pass
Example #12
def build_fwdtable(V, E, tf, args):
    # Build tables of vertex and edge names for forwarding table parsing
    num_v = {}
    for v in V:
        num_v[v.vnum] = v
    num_e = {}
    for e in E:
        num_e[e.enum] = e

    # Build the forwarding tables
    fwdtable = Fwdtable()

    #TODO: If this actually takes too much time, we need a parser from
    # ijson.parse
    #print 'Parsing tf...'
    for subtable in ijson.items(tf, "item"):
        for vnum in subtable:
            v = num_v[int(vnum)]
            if v not in fwdtable:
                fwdtable[v] = {}
            for entry in subtable[vnum]:
                key, value = entry
                dsti, ctreei, utreei, ei = key
                port, mark_failed = value
                dst = num_v[int(dsti)]
                e = num_e[int(ei)]
                #XXX: TODO: fail less extremely
                assert(e.v1 == v and e.p1 == port or e.v2 == v and e.p2 == port)
                assert(port in v.ports)
                if (dst, ctreei) not in fwdtable[v]:
                    fwdtable[v][dst, ctreei] = []
                fwdtable[v][dst, ctreei].append(((utreei, e), (port, mark_failed)))
    return fwdtable
Example #13
def main(argv):
    vocabDict = {}
    labelDict = {}
    with open("labelIndex.json") as f:
        labelDict = json.load(f)
    numTokens = len(labelDict)
    print numTokens
    with open('2mdumpR.json') as f:
        i = 0
        articles = ijson.items(f, 'item')
        finalRow = []
        finalCol = []
        for article in articles:
            if i%1000 == 0:
                print i
            row = []
            col = []
            labels = article['meshMajor']
            for label in labels:
                if labelDict.has_key(label):
                    row.append(i)
                    col.append(labelDict[label])
            finalRow.extend(row)
            finalCol.extend(col)
            i += 1
#            if (i+1)%250000 == 0:
#               break;
    freq = np.ones(len(finalRow))
    output = csr_matrix((freq, (finalRow, finalCol)), shape = (2000000, 26840))
    with open('labelMat.pkl', 'wb') as fp:
        cPickle.dump(output, fp, -1)
Example #14
def main():
    if len(sys.argv) != 3:
        print('usage: {0} bulk.json http://localhost:5984/my_db'.format(sys.argv[0]))
        return
    FILE = sys.argv[1]
    DB_NAME = sys.argv[2]
    bulksize = 10000
    f = open(FILE, 'r')

    doc_count = 0
    batch_count = 0
    docs = []
    for doc in items(f, 'docs.item'):
        docs.append(json.dumps(doc, use_decimal=True))
        doc_count += 1
        if doc_count == bulksize:
            send_req(DB_NAME, docs)
            doc_count = 0
            docs = []
            print('finished batch: {0}'.format(batch_count))
            batch_count += 1

    # send remaining docs (if any)
    if len(docs) > 0:
        send_req(DB_NAME, docs)
Example #15
def process_provider_into_es(fname, es, conn):
    status = False
    with open(fname, 'r') as infile:
        actions = []
        try:
            for doc in ijson.items(infile, "item"):
                if doc['type'] == 'INDIVIDUAL':
                    action = {
                        "_index": "data",
                        "_type": "provider",
                        "_source": doc
                        }
                else:
                    action = {
                        "_index": "data",
                        "_type": "facility",
                        "_source": doc
                    }
                actions.append(action)
                if len(actions) > 0 and len(actions) % 50 == 0:
                    helpers.bulk(es, actions)
                    status = True
                    actions = []
        except (KeyboardInterrupt, SystemExit):
            conn.rollback()
            raise
        except (UnicodeDecodeError, ValueError, ijson.JSONError) as ex:
            print "{0}\n".format(str(ex))
    return status
Example #16
def run():
    """Removes all content from database and creates new tables"""
    limit = 0

    with app.app_context():
        f = urlopen(app.config["DATA_URL"])
        objects = items(f, app.config["DATA_LOCATION"])
        row_limit = app.config["ROW_LIMIT"]
        chunk_size = min(row_limit or "inf", app.config["CHUNK_SIZE"])
        debug = app.config["DEBUG"]

        if app.config["TESTING"]:
            createdb()

        for records in utils.chunk(objects, chunk_size):
            count = len(records)
            limit += count
            flattened = [dict(utils.flatten_fields(r)) for r in records]

            if debug:
                print ("Inserting %s records into the database..." % count)
                # pprint(flattened)

            db.engine.execute(Data.__table__.insert(), flattened)

            if row_limit and limit >= row_limit:
                break

        if debug:
            print ("Successfully inserted %s records into the database!" % limit)
Example #17
def read_json(filename, mode='rt', encoding=None, prefix=''):
    """
    Iterate over JSON objects matching the field given by ``prefix``.
    Useful for reading a large JSON array one item (with ``prefix='item'``)
    or sub-item (``prefix='item.fieldname'``) at a time.

    Args:
        filename (str): /path/to/file on disk from which json items will be streamed,
            such as items in a JSON array; for example::

                [
                    {"title": "Harrison Bergeron", "text": "The year was 2081, and everybody was finally equal."},
                    {"title": "2BR02B", "text": "Everything was perfectly swell."}
                ]

        mode (str, optional)
        encoding (str, optional)
        prefix (str, optional): if '', the entire JSON object will be read in at once;
            if 'item', each item in a top-level array will be read in successively;
            if 'item.text', each array item's 'text' value will be read in successively

    Yields:
        next matching JSON object; could be a dict, list, int, float, str,
            depending on the value of ``prefix``

    Notes:
        Refer to ``ijson`` at https://pypi.python.org/pypi/ijson/ for usage details.
    """
    with io.open(filename, mode=mode, encoding=encoding) as f:
        if prefix == '':
            yield json.load(f)
        else:
            for item in ijson.items(f, prefix):
                yield item
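A minimal usage sketch of read_json, based on the docstring above; the file name articles.json is only an assumption for illustration:

# Hypothetical usage of read_json above; "articles.json" is an assumed local
# file containing a top-level JSON array like the one in the docstring.
for doc in read_json('articles.json', prefix='item'):
    print(doc['title'])      # each array element, one dict at a time

for text in read_json('articles.json', prefix='item.text'):
    print(text)              # only the 'text' field of each element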
Example #18
def _LoadHistogramList(input_file):
  """Incremental file decoding and JSON parsing when handling new histograms.

  This helper function takes an input file which yields fragments of JSON
  encoded histograms then incrementally builds the list of histograms to return
  the fully formed list in the end.

  Returns
    This function returns an instance of a list() containing dict()s decoded
    from the input_file.

  Raises
    This function may raise ValueError instances if we end up not finding valid
    JSON fragments inside the file.
  """
  try:
    with timing.WallTimeLogger('json.load'):
      def NormalizeDecimals(obj):
        # Traverse every object in obj to turn Decimal objects into floats.
        if isinstance(obj, decimal.Decimal):
          return float(obj)
        if isinstance(obj, dict):
          for k, v in obj.items():
            obj[k] = NormalizeDecimals(v)
        if isinstance(obj, list):
          obj = [NormalizeDecimals(x) for x in obj]
        return obj

      objects = [NormalizeDecimals(x) for x in ijson.items(input_file, 'item')]

  except ijson.JSONError as e:
    # Wrap exception in a ValueError
    raise ValueError('Failed to parse JSON: %s' % (e))

  return objects
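The NormalizeDecimals helper above exists because ijson parses non-integer JSON numbers into decimal.Decimal objects; a minimal sketch of that behaviour, using an in-memory stream instead of the input_file:

import io
from decimal import Decimal

import ijson

# ijson yields decimal.Decimal for fractional JSON numbers, which is why
# _LoadHistogramList converts them back to float before returning.
stream = io.BytesIO(b'[{"value": 1.5}, {"value": 2}]')
for obj in ijson.items(stream, 'item'):
    print(obj['value'], isinstance(obj['value'], Decimal))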
Example #19
def _get_dataset_indexes():
    train_indexes = set()
    eval_indexes = set()
    with open(POSTS_FILE_PATH, 'r') as f:
        it = 0
        for post_num, post in enumerate(ijson.items(f, 'item')):
            was = False
            for hub in post['hubs']:
                for category_num, category in enumerate(CATEGORIES):
                    if hub in category:
                        was = True

            if not was:
                continue

            if random() > TRAIN_RATIO:
                eval_indexes.add(post_num)
            else:
                train_indexes.add(post_num)

            if DEBUG:
                it += 1
                if it > 100:
                    break

    accepted_posts_num = len(eval_indexes) + len(train_indexes)
    print('eval data set ratio', len(eval_indexes) / float(accepted_posts_num))
    return train_indexes, eval_indexes
Example #20
 def handle(self, *args, **options):
     print "Clean old downloaded files"
     os.system("rm %s %s" % (join(settings.MEMOPOL_TMP_DIR, "ep_votes.json"), join(settings.MEMOPOL_TMP_DIR, "ep_votes.json.xz")))
     print "Download vote data from parltrack"
     os.system("wget -O %s http://parltrack.euwiki.org/dumps/ep_votes.json.xz" % join(settings.MEMOPOL_TMP_DIR, "ep_votes.json.xz"))
     print "unxz it"
     os.system("unxz %s" % join(settings.MEMOPOL_TMP_DIR, "ep_votes.json.xz"))
     print "cleaning old votes data..."
     connection.cursor().execute("DELETE FROM votes_recommendationdata")
     transaction.commit_unless_managed()
     print RecommendationData.objects.count()
     print "read file"
     a = 1
     with transaction.commit_on_success():
         # I need to parse the json file by hand, otherwise this eats way too much memory
         for vote in ijson.items(open(join(settings.MEMOPOL_TMP_DIR, "ep_votes.json")), 'item'):
             RecommendationData.objects.create(proposal_name=vote.get("report", vote["title"]),
                                               title=vote["title"],
                                               data=dumps(vote, indent=4),
                                               date=parse(vote["ts"]))
             reset_queries()
             sys.stdout.write("%s\r" % a)
             sys.stdout.flush()
             a += 1
         sys.stdout.write("\n")
Example #21
def main():
    argv = sys.argv[1].split()
    inFile = argv[0]
    outDir = argv[1]
    cpu_count = int([node.split()[1] for node in open(os.environ['PE_HOSTFILE'])][0])
    pool = mp.Pool(cpu_count)
    logger.info("using no. of CPUs: {0}...".format(cpu_count))
    manager = mp.Manager()
    q = manager.Queue()

    abstracts = []
    with open(inFile, 'r') as data:
        objs = ijson.items(data, "item")
        for abstract in objs:
            abstracts.append((abstract["filepath"], abstract["pmid"]))

    logger.info("extracting text from {} abstracts...".format(len(abstracts)))
    jobs = []
    watcher = pool.apply_async(listener, (q, outDir,))

    for filepath, pmid in abstracts:
        job = pool.apply_async(worker, (filepath, pmid, q))
        jobs.append(job)

    for job in jobs:
        job.get()

    logger.info("adding kill to queue...")
    q.put(["KILL"])
    pool.close()
    pool.join()
    logger.info("Job complete.")
Example #22
 def test_items(self):
     meta = list(items(StringIO(JSON), 'docs.item.meta'))
     self.assertEqual(meta, [
         [[1], {}],
         {'key': 'value'},
         None,
     ])
Example #23
def run():
    """Populates db with most recent data"""
    limit = 0

    with app.app_context():
        f = urlopen(app.config['DATA_URL'])
        objects = items(f, app.config['DATA_LOCATION'])
        row_limit = app.config['ROW_LIMIT']
        chunk_size = min(row_limit or 'inf', app.config['CHUNK_SIZE'])
        debug = app.config['DEBUG']

        if app.config['TESTING']:
            createdb()

        for records in utils.chunk(objects, chunk_size):
            count = len(records)
            limit += count
            flattened = [dict(utils.flatten_fields(r)) for r in records]

            if debug:
                print('Inserting %s records into the database...' % count)
                # pprint(flattened)

            db.engine.execute(Data.__table__.insert(), flattened)

            if row_limit and limit >= row_limit:
                break

        if debug:
            print('Successfully inserted %s records into the database!' % limit)
Example #24
def generate_mols_from_json(json):
	j=0
	for item in items(json, "item"):
		j+=1
		#print "  reading",j
		mol = parse_mol_json(item)
		yield mol
Example #25
    def exec_worker_map_filter(self, endpoint, args, request):
        """Forward request and process response.

        Forward the request to the third party service, and map the
        response through the ``process`` user function.

        """
        if endpoint != 'search':
            raise APIException("service of type 'map_filter' does "
                               "not support /list")

        if is_https(self.url) and request.method == 'GET':
            method = tls1_get
        else:
            method = getattr(requests, request.method.lower())
        try:
            headers = {'Authorization':
                           request.headers['Authorization']}
        except KeyError:
            headers = {}
        response = method(self.url,
                          params=request.args,
                          headers=headers,
                          stream=True)
        if response.ok:
            path = '.'.join(filter(None, [self.json_path, 'item']))
            results = ijson.items(FileLikeWrapper(response), path)

            return Response(
                result_generator(process_by_client(self, results),
                                 lambda: {}),
                mimetype='application/json')
        else:
            raise APIException('response from external service: {}'
                               .format(response))
Example #26
def main(filepath, outputpath):
	include_from = 'Gene Kogan <*****@*****.**>'
	exclude_to = 'GeneKogan<*****@*****.**>'

	output_file = open(outputpath, 'w')	
	input_file = open(filepath)	
#	num_items = len(ijson.items(input_file, 'item'))

	num_found_items = 0
	errors = 0
	for (i,t) in enumerate(ijson.items(input_file, 'item')):
		if (i % 100 == 0):
			print "try email %d, found %d so far" % (i, num_found_items)
		try:
			if t['From'] == include_from and str(t['To'][0]) != exclude_to: 
				for p in t['parts']:
					if p['contentType']=='text/plain':
						content = parse_email_content(t["parts"][0]["content"])
						output_file.write(removeNonAscii(content))
						output_file.flush()
						num_found_items += 1
		except:
			print "Ooops, error "+str(errors)+"...."
			errors += 1
	output_file.close()
	input_file.close()
Example #27
 def call(self):
     """
     Makes a request to cghub server.
     Returns generator that returns Result objects.
     """
     self.patch_input_data()
     query = self.build_query()
     url = '%s%s' % (self.server_url, self.uri)
     if query:
         url = '%s?%s' % (url, query)
     xml = self.get_source_file(url)
     if self.format == self.FORMAT_JSON:
         results = ijson.items(xml, 'response.docs.item')
         for item in results:
             yield item
     else:
         # http://docs.python.org/dev/library/xml.dom.pulldom.html
         doc = pulldom.parse(xml)
         for event, node in doc:
             if event == pulldom.START_ELEMENT:
                 if node.tagName == 'doc':
                     doc.expandNode(node)
                     # convert to python object
                     # http://docs.python.org/2/library/xml.etree.elementtree.html
                     result_xml = node.toxml(encoding='utf-8')
                     tree = ElementTree.fromstring(result_xml)
                     result = Result(tree)
                     yield self.patch_result(result, result_xml)
                 elif node.tagName == 'result':
                     self.hits = int(node.getAttribute('numFound'))
Example #28
    def executer(self, *args):
        """Execute remotely"""

        options = self.options

        try:
            # from dbnav import daemon
            # if not daemon.is_running(options):
            #     daemon.start_server(options)

            url = 'http://{host}:{port}/{path}'.format(
                host=options.host,
                port=options.port,
                path=COMMANDS[options.prog])
            request = json.dumps(args[1:])

            log.logger.debug('Request to %s:\n%s', url, request)

            response = urllib2.urlopen(url, request)

            for i in ijson.items(response, 'item'):
                yield from_json(i)
        except urllib2.HTTPError as e:
            raise from_json(json.load(e))
        except urllib2.URLError as e:
            log.logger.error('Daemon not available: %s', e)
        except BaseException as e:
            log.logger.exception(e)
Example #29
def _get_labels(train_indexes, eval_indexes):
    train_labels = []
    eval_labels = []
    it = 0
    with open(POSTS_FILE_PATH, 'r') as f:
        for post_num, post in enumerate(ijson.items(f, 'item')):
            if DEBUG:
                it += 1
                if it > 100:
                    break

            is_in_train_set = post_num in train_indexes
            is_in_eval_set = post_num in eval_indexes
            if not is_in_train_set and not is_in_eval_set:
                continue

            label = [0.] * len(CATEGORIES)
            for hub in post['hubs']:
                for category_num, category in enumerate(CATEGORIES):
                    if hub in category:
                        label[category_num] = 1

            if is_in_train_set:
                train_labels.append(label)
            else:
                eval_labels.append(label)

    return train_labels, eval_labels
Example #30
 def search_and_match(coord_index):
     coord = coordinates[coord_index]
     result = read_search(coord, coord_index)
     reads = ijson.items(HTTPStream(result), 'reads.item')
     reports = coord_indices[coord_index]
     reports_visited = []
     for read in reads:
         read_coord = Coordinate(read['referenceSequenceName'],
                                 read['position'],
                                 read['position']+get_ref_length(read['cigar']))
         covered_reports = [report for report in reports
                                 if (read_coord.chrom == report['chrom'] and
                                     read_coord.start <= report['seqStart'] and
                                     read_coord.end > report['seqEnd'])]
         if covered_reports:
             new_matched_reports = [report for report in covered_reports 
                                     if (report['reportId'] not in reports_visited and
                                         match(report, read))]
             matched_reports.extend(new_matched_reports)
             reports_visited.extend([report['reportId'] for report in covered_reports
                                         if report['reportId'] not in reports_visited])
             #push read into cache:
             read['repository'] = report_set['repository']
             read['readsetId'] = report_set['readsetId']
             read['start'] = read['position']
             read['end'] = read['position'] + get_ref_length(read['cigar'])
             CachedReads.save(read)
         if len(reports_visited) >= len(reports):
             break
Example #31
def load_and_write_content(filename, filename2):
    count=0
    file = codecs.open(filename2, 'w', encoding='utf-8')
    with open(filename, 'r') as fd:
        for item in ijson.items(fd, 'item'):
            count+=1
            file.write('[[제목]]: ')   # Korean label: "[[Title]]: "
            file.write(item['title'])
            file.write('\n')
            file.write('[[내용]]: \n')  # Korean label: "[[Contents]]: "
            file.write(item['text'])
            file.write("\n")
    file.close()
    print('contents count=', count)
Example #32
def any2dict(f, ext='xml', html5=False, path=None):
    path = path or ''
    if ext in {'xml', 'html'}:
        xml = ext == 'xml'
        root = xml2etree(f, xml, html5).getroot()
        replaced = '/'.join(path.split('.'))
        tree = next(xpath(root, replaced)) if replaced else root
        content = etree2dict(tree)
    elif ext == 'json':
        content = next(items(f, path))
    else:
        raise TypeError('Invalid file type %s' % ext)

    return content
Example #33
    def parse_json_items(self, tag, limit=0):
        self.__items = []
        self.__file.seek(0)
        cnt = 0

        objs = ijson.items(self.__file, tag)
        for obj in objs:
            item = json.dumps(obj, sort_keys=True, indent=4, ensure_ascii=True)
            self.__items.append(json.loads(item))
            cnt += 1
            if limit != 0 and cnt >= limit:
                break

        return self.__items
Example #34
def get_main_blocks(ast):
    cont_main_blks = []
    with open(ast, 'r') as f:
        objects = ijson.items(f, 'children')
        contract_body = (o for o in objects)
        for child in contract_body:
            d = dict(child[1])
            for i in range(0, len(d['children'])):
                cont_main_blks.append({
                    "id": (d['children'][i])['id'],
                    "name": (d['children'][i])['name'],
                    "src": (d['children'][i])['src']
                })
    return cont_main_blks
Example #35
def migrate_data(args, table, config):
    global gargs
    gargs = args
    change_table_info(table, config)

    with open(os.path.join(args.data_dir, "%s.json" % table.name)) as f:
        rows = ijson.items(f, "results.item")
        pool = multiprocessing.Pool()
        result = pool.imap(handle_rows, rows)
        for row in result:
            if isinstance(row, dict):
                handle_fault_data(row)
            else:
                insert_data(row)
Example #36
 def parse_data_to_case_class(input):
     conversations = []
     with open(input["data_path"] + ".json", encoding="utf8") as data:
         print("Successfully opened " + input["data_path"] + ".json...")
         for conversation in ijson.items(data,
                                         'conversations.conversation.item'):
             id = conversation["@id"]
             messages = []
             for message in conversation["message"]:
                 messages.append(
                     Message.Message(message["author"], message["time"],
                                     str(message["text"])))
             conversations.append(Conversation.Conversation(id, messages))
     return conversations
Example #37
 def _getEsriRESTJSON(self,
                      url,
                      params,
                      attempt=1,
                      useIjson=False,
                      debug=None):
     """Helper function to query an Esri REST endpoint and return json"""
     #Wait five seconds if previous error
     if attempt > 1 and attempt != 6:
         time.sleep(5)
     #Set token if registered with object
     if self.token != None:
         params['token'] = self.token
     #all other attempts...
     if attempt <= 5:
         data = urllib.urlencode(params)
         req = urllib2.Request(url, data)
         try:
             response = urllib2.urlopen(req)
         except httplib.BadStatusLine as e:
             if debug: debug.log("Bad Status Line at attempt %s" % attempt)
             return self._getEsriRESTJSON(url,
                                          params,
                                          attempt + 1,
                                          useIjson=useIjson,
                                          debug=debug)
         except urllib2.HTTPError as e:
             if debug:
                 debug.log("HTTP Error at attempt %s: sleeping" % attempt)
             return self._getEsriRESTJSON(url,
                                          params,
                                          attempt + 1,
                                          useIjson=useIjson,
                                          debug=debug)
         if useIjson:
             if debug: debug.log("Using ijson")
             return ijson.items(response, "features.item")
         else:
             final = json.loads(response.read())
             if 'error' in final.keys():
                 if debug: debug.log("Error in json loads " + str(final))
                 return self._getEsriRESTJSON(url,
                                              params,
                                              attempt + 1,
                                              debug=debug)
             else:
                 return final
     else:
         if debug: debug.log("Too many attempts")
         raise MapServiceError("Error Accessing Map Service " + self.url)
Example #38
 def readJsonInput(self, test):
     try:
         # load test specific Dictionary, using Key = func
         # this is to avoid loading very large JSON in memory
         log.debug(" Read JSON Section: " + test)
         jInput = ""
         with open(self.jsonFile, 'rb') as f:
             jInst = ijson.items(f, test)
             for it in jInst:
                 jInput = jInput + json.dumps(it)
         log.debug("Read json JIn: {}".format(jInput))
     except Exception as e:
         printExceptionDetails()
     return jInput
Example #39
    def get_moving_average(self, symbol, days):
        history = []
        averages = []
        months_necessary = math.ceil(days / 22)
        raw_history = ijson.items(self.get_history(symbol, months_necessary),
                                  'history.day.item')

        for day in raw_history:
            history.append({'date': day['date'], 'close': day['close']})

        last_elements = list(chain(history[-days:]))
        moving_average = sum([e['close'] for e in last_elements]) / days

        return moving_average
Example #40
    def __init__(self, jsonInput):
        with open(jsonInput, 'r') as jsonInputFile:
            # self.jsonDoc = json.load(jsonInputFile.read())
            # self.jsonDoc = json.load(jsonInputFile)
            objects = ijson.items(jsonInputFile, "Document.item")
            blocks = list(objects)
            # testBlock = list(ijson.items(jsonInputFile, "Document.item"))

        jsonInputFile.close()

        self.__rawJson = blocks
        self.__dataFrame = pd.DataFrame()
        # self.__text = ""
        self.__response = ""
Example #41
    def read_big_json_file(file_path, prefix=""):
        """ijson读取大文件
        prefix: None 读取全部内容
         prefix:"earth.europe.item" 读取europe中的内容

        {
          "earth": {
            "europe": [
              {
                "name": "Paris",
                "type": "city",
                "info": "aaa"
              },
              {
                "name": "Thames",
                "type": "river",
                "info": "sss"
              },
              {
                "name": "yyy",
                "type": "city",
                "info": "aaa"
              },
              {
                "name": "eee",
                "type": "river",
                "info": "sss"
              }
            ],
            "america": [
              {
                "name": "Texas",
                "type": "state",
                "info": "jjj"
              }
            ]
          }
        }

        """
        with open(file_path, 'r', encoding='utf-8') as f:

            file_gen = ijson.items(f, prefix)

            while True:
                try:
                    print(file_gen.__next__())
                except StopIteration:
                    print("Finished reading the data")
                    break
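A usage sketch for the method above, assuming the nested document from the docstring has been saved to a hypothetical big.json and that read_big_json_file is callable as shown:

# Hypothetical call; "big.json" holds the earth/europe document from the
# docstring, so each city/river object under "europe" is printed in turn.
read_big_json_file("big.json", prefix="earth.europe.item")

# prefix="" would print the whole document as a single object instead.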
Example #42
def Deserializer(stream_or_string, **options):

    if isinstance(stream_or_string, six.string_types):
        stream_or_string = six.BytesIO(stream_or_string.encode('utf-8'))
    try:
        objects = ijson.items(stream_or_string, 'item')
        for obj in PythonDeserializer(objects, **options):
            yield obj
    except GeneratorExit:
        raise
    except Exception as e:
        # Map to deserializer error
        six.reraise(DeserializationError, DeserializationError(e),
                    sys.exc_info()[2])
Example #43
    def handle(self, *args, **kwargs):
        """Entry point for load data command"""

        self.stdout.write(self.style.SUCCESS('Starting load task:'))

        data_path = DATA_PATH
        if kwargs['sample']:
            data_path = SAMPLE_PATH

        log = self.get_error_log()
        was_error = False

        f = open(data_path)
        objects = ijson.items(f, 'item')

        for o in objects:
            try:
                if not o['value']:
                    o['value'] = DEFAULT_QUESTION_VALUE

                Question.objects.create(
                    question=unescape(o['question']),
                    air_date=o['air_date'],
                    answer=unescape(o['answer']),
                    value=int(o['value'][1:].replace(',', '')),
                    round=o['round'],
                    show_number=o['show_number'],
                )
                print(self.style.SUCCESS('.'),
                      sep=' ',
                      end='',
                      file=sys.stdout,
                      flush=True)
            except Exception as e:
                print(self.style.ERROR('X'),
                      sep=' ',
                      end='',
                      file=sys.stdout,
                      flush=True)
                was_error = True
                log.write('{} - {}\n'.format(e, str(o)))
        if was_error:
            self.stdout.write('')
            raise CommandError(
                'Error loading question data. Check logs/import_data.error for more details'
            )
        else:
            self.stdout.write(self.style.SUCCESS('\nDone'))
        log.close()
        f.close()
Example #44
def DumpReader(lang, local_file, from_point=None):
    with bz2.open(local_file, "rb") as fin:
        reader = enumerate(ijson.items(fin, "item"))

        if from_point:
            for i, data in reader:
                if data and isinstance(data, dict) and data.get(
                        'id', None) == from_point:
                    break  # OK. found
                else:
                    continue

        for i, data in reader:
            yield (data, lang, i)
Example #45
def split_into_batches(cookbook_file):
    current = []
    count = 0
    with open(cookbook_file, 'r') as json_file:
        for item in ijson.items(json_file, "item"):
            current.append(item)
            if count > 0 and count % BATCH_SIZE == 0:
                print("Finished Item " + str(count))
                with open(
                        BATCH_PATH + str(int(count / BATCH_SIZE - 1)) +
                        OUTFILE, 'w') as outfile:
                    json.dump(current, outfile)
                current = []
            count += 1
Example #46
def download_item(url, filename='data.txt', *, num_retries=2, max_page=300, \
        page_size=20, page_no=1, proxy=None):
    """Get items
    """

    if page_no > max_page:
        return

    # check, only one '?'
    if url.count('?') != 1:
        print("Can't use this way, set url to None...")
        url = ''

    hd, tl = url.split('?')
    head = 'https://list.tmall.com/m/search_items.htm?page_size=%d&page_no=%d&' % (page_size, page_no)
    url_req = head + tl

    print('Downloading %s...' % url_req)
    try:
        html_response = urllib.request.urlopen(url_req)  # .read().decode('utf-8')
    except urllib.error.URLError as e:
        print("Downloading error: ", e.reason)
        html_response = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                return download_item(url, filename, num_retries=num_retries-1,\
                        max_page=max_page, page_size=page_size, page_no=page_no, proxy=proxy)
    print('Request done...')
    #json_html = json.loads(download(""))

    objects = ijson.items(html_response, 'item.item')

    # TODO check Generator empty ??
    #if not list(objects):
    #    return

    items = (o for o in objects if o['item_id'])   # can set filter
    with open(filename, "a+") as f:
        for item in items:
            item_id = item['item_id']
            price = item['price']
            title = item['title']
            print(item_id, title, price, sep=",", file=f)

    # TODO delay
    if page_no % 10 == 0:
        time.sleep(2)

    return download_item(url, filename, num_retries=num_retries, max_page=max_page, \
            page_size=page_size, page_no=page_no+1, proxy=proxy)
Example #47
    def __getItems(self, prefix, chunksize):
        if prefix in self.cache:
            return self.cache[prefix]

        # cache miss, gotta parse the JSON again
        ofs = self.fd.tell()
        self.fd.seek(self.startpos)
        items = list(ijson.items(self.fd, prefix))
        self.fd.seek(ofs)

        if chunksize == 0 or len(items) <= chunksize:
            self.cache[prefix] = items

        return items
Example #48
def get_paragraph_questions(json_file_name, vocab_file=None):
    """
    :param: json_file_name of the squad data set to parse
    :param: existing vocab to build on
    """
    json_file = open(json_file_name, "r")
    data = []
    if vocab_file is None:
        vocab = set()
    else:
        vocab_file = open(vocab_file, "rb")
        vocab = set(pickle.load(vocab_file))
        vocab_file.close()
    print("Start processing data set %s" % json_file_name)
    for item in ijson.items(json_file, "data.item"):
        for paragraphs in item["paragraphs"]:
            paragraph = paragraphs["context"]
            indices, sentences = extract_sentences(paragraph)
            for qa in paragraphs["qas"]:
                if not qa["is_impossible"]:
                    # add (sentence, question) pairs to data
                    if len(qa["answers"]) != 0:
                        answer = qa["answers"][0]
                        answer_start_index = int(answer["answer_start"])
                        sentence_index = 0
                        for i in range(len(indices)):
                            if answer_start_index > indices[i]:
                                sentence_index = i
                            else:
                                break
                        sentence = sentences[sentence_index]
                        data.append((normalize_string(sentence),
                                     normalize_string(qa["question"])))
                    # add words in question to vocabulary
                    for word in get_words(qa["question"]):
                        vocab.add(word)
            # add words in sentences to vocabulary
            for word in flatten(map(get_words, sentences)):
                vocab.add(word)
    vocab.add(START_TOKEN)
    vocab.add(END_TOKEN)
    vocab.add(UNKNOWN_WORD)
    pickle.dump(list(vocab),
                open(os.path.join(EMBEDDING_DIR, "vocab.pkl"), 'wb'))
    pickle.dump(
        data,
        open(
            os.path.join(EMBEDDING_DIR,
                         "%s.pkl" % json_file_name.split("-")[0]), 'wb'))
    print("Done processing data set %s" % json_file_name)
Example #49
def build_index():
    embed = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
    index = faiss.IndexFlatL2(512)
    docMessages = []
    embeddingtolabelmap = {}
    labeltotextmap = {}
    with open(
            '../../data/codeGraph/stackoverflow_questions_per_class_func_1M_filtered.json',
            'r') as data:
        jsonCollect = ijson.items(data, 'results.bindings.item')
        i = 0
        for jsonObject in jsonCollect:
            objectType = jsonObject['class_func_type']['value'].replace(
                'http://purl.org/twc/graph4code/ontology/', '')
            if objectType != 'Class':
                continue
            label = jsonObject['class_func_label']['value']
            docLabel = label + " docstring " + str(i)
            docStringText = jsonObject['docstr']['value'] + ' ' + str(i)
            soup = BeautifulSoup(docStringText, 'html.parser')
            for code in soup.find_all('code'):
                code.decompose()
            docStringText = soup.get_text()
            embeddedDocText = embed([docStringText])[0]
            newText = np.asarray(embeddedDocText,
                                 dtype=np.float32).reshape(1, -1)
            index.add(newText)
            docMessages.append(embeddedDocText.numpy().tolist())
            embeddingtolabelmap[tuple(
                embeddedDocText.numpy().tolist())] = docLabel
            labeltotextmap[docLabel] = docStringText

            stackLabel = label + " stack " + str(i)
            stackQuestion = jsonObject['content']['value']
            stackAnswer = jsonObject['answerContent']['value']
            stackText = stackQuestion + " " + stackAnswer + ' ' + str(i)
            soup = BeautifulSoup(stackText, 'html.parser')
            for code in soup.find_all('code'):
                code.decompose()
            stackText = soup.get_text()
            embeddedStackText = embed([stackText])[0]
            newStackText = np.asarray(embeddedStackText,
                                      dtype=np.float32).reshape(1, -1)
            index.add(newStackText)
            docMessages.append(embeddedStackText.numpy().tolist())
            embeddingtolabelmap[tuple(
                embeddedStackText.numpy().tolist())] = stackLabel
            labeltotextmap[stackLabel] = stackText
            i += 1
        return (index, docMessages, embeddingtolabelmap, labeltotextmap)
Example #50
 def __getEsriRESTJSON(self, url, params, attempt=1, useIjson=False):
     """Helper function to query an Esri REST endpoint and return json"""
     # Wait five seconds if previous error
     if attempt > 1 and attempt != 6:
         time.sleep(5)
     # Set token if registered with object
     if self.token != None:
         params['token'] = self.token
     # all other attempts...
     if attempt <= 5:
         data = urllib.urlencode(params)
         req = urllib2.Request(url, data)
         try:
             response = urllib2.urlopen(req)
         except httplib.BadStatusLine as e:
              self.__logMsg(40, "Bad Status Line at attempt %s" % attempt)
             return self.__getEsriRESTJSON(url,
                                           params,
                                           attempt + 1,
                                           useIjson=useIjson)
         except urllib2.HTTPError as e:
              self.__logMsg(40,
                            "HTTP Error at attempt %s: sleeping" % attempt)
             return self.__getEsriRESTJSON(url,
                                           params,
                                           attempt + 1,
                                           useIjson=useIjson)
         except urllib2.URLError as e:
             self.__logMsg(40, "Verify SSL Cert Error")
             dontVerifySSL()
             return self.__getEsriRESTJSON(url,
                                           params,
                                           attempt + 1,
                                           useIjson=useIjson)
         if useIjson:
             #need to figure out a way to deal with this if error is returned, possibly stop using ijson
             return ijson.items(response, "features.item")
         else:
             final = json.loads(response.read())
             if 'error' in final.keys():
                 self.__logMsg(40, "Error in json loads " + str(final))
                 return self.__getEsriRESTJSON(url, params, attempt + 1)
             elif 'features' in final.keys():
                 return final['features']
             else:
                 return final
     else:
         self.__logMsg(30, "Too many attempts")
         raise MapServiceError("Error Accessing Map Service " + self.url)
Example #51
def parser(base, objconf, skip=False, **kwargs):
    """ Parses the pipe content

    Args:
        base (str): The base currency (exchanging from)
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        assign (str): Attribute to assign parsed content (default: exchangerate)
        stream (dict): The original item

    Returns:
        dict: The item

    Examples:
        >>> from riko import get_path
        >>> from meza.fntools import Objectify
        >>>
        >>> url = get_path('quote.json')
        >>> conf = {'url': url, 'currency': 'USD', 'delay': 0, 'precision': 6}
        >>> item = {'content': 'GBP'}
        >>> objconf = Objectify(conf)
        >>> kwargs = {'stream': item, 'assign': 'content'}
        >>> parser(item['content'], objconf, **kwargs)
        Decimal('1.545801')
    """
    same_currency = base == objconf.currency

    if skip:
        rate = kwargs['stream']
    elif same_currency:
        rate = Decimal(1)
    else:
        decode = objconf.url.startswith('http')

        if objconf.memoize and not objconf.cache_type:
            objconf.cache_type = 'auto'

        with fetch(decode=decode, **objconf) as f:
            json = next(items(f, ''))

    if not (skip or same_currency):
        places = Decimal(10)**-objconf.precision
        rates = parse_response(json)
        rate = calc_rate(base, objconf.currency, rates, places=places)

    return rate
Example #52
def read_json(data_result_file_path):
    arr = []
    logging.info('opening {}'.format(data_result_file_path))
    counter = 0
    with open(data_result_file_path, 'r') as json_file:
        items = ijson.items(json_file, 'item')
        for item in items:
            counter += 1
            if 'value_a' in item:
                item['value_a'] = parse_decimal(item['value_a'])
            if 'value_b' in item:
                item['value_b'] = parse_decimal(item['value_b'])
            arr.append(item)
    logging.info('JSON size: {}'.format(counter))
    return arr
Example #53
def getcontent():
    global articles
    global headlines
    global summary
    with open('nstream.json', 'r') as stream:
        articlestream = ijson.items(stream, 'articles')
        k = list(articlestream)
        articles = list(k[0])
        headlines = [title['title'] for title in articles]
        summary = [sum['description'] for sum in articles]
        sources = list(set([s["source"]["name"] for s in articles]))
        print('successfully fetched articles')
        print("Total articles", len(headlines))
        print("Sources: ", sources)
        return headlines, summary
Example #54
def iter_file(fd, root):
    """Iterate over `root` array in file provided by `filename` using ijson

    :param bytes fd: File descriptor
    :param str root: Array field name inside file
    :return: Iterator of bytes read and item as a tuple

    >>> [r for r in iter_file(open('tests/data/ocds-sample-data.json', 'rb'), 'records')]
    []
    >>> len([r for r in iter_file(open('tests/data/ocds-sample-data.json', 'rb'), 'releases')])
    6
    """
    reader = ijson.items(fd, f"{root}.item", map_type=OrderedDict)
    for item in reader:
        yield item
Example #55
def parse_data_to_case_class(input):
    conversations = []
    with open(input["data_path"] + ".json") as data:
        print("Successfully opened " + input["data_path"] + ".json...")
        for root in ijson.items(data, 'conversations.conversation'):
            print("Start to process conversations...")
            for conversation in root:
                id = conversation["@id"]
                messages = []
                for message in conversation["message"]:
                    messages.append(
                        Message(message["author"], message["time"],
                                message["text"]))
                conversations.append(Conversation(id, messages))
    return conversations
Example #56
def _read_json(file_path: Path) -> List[Record]:
    data = []

    with open(file_path, "rb") as f:
        objects = ijson.items(f, "fields.item")
        for obj in objects:
            dimensions, marks = {}, {}
            for k, v in obj.items():
                if "D" in k:
                    dimensions[k] = v
                elif "M" in k:
                    marks[k] = v
            data.append(Record(dimensions, marks))

    return data
Example #57
    def __init__(self, jsonInput):
        # Read json file as input
        blocks = []
        with open(jsonInput, 'r') as jsonInputFile:
            try:
                objects = ijson.items(jsonInputFile, "Document.item")
                blocks = list(objects)
            except:
                pass

        jsonInputFile.close()

        self.__rawJson = blocks
        self.__dataFrame = pd.DataFrame()
        self.__tableDataFrame = pd.DataFrame()
Example #58
 def read_nvd_dir(cls, nvd_dir):
     """
     Iterate over all the CVEs contained in NIST Vulnerability Database
     feeds since NVD_START_YEAR. If the files are missing or outdated in
     nvd_dir, a fresh copy will be downloaded, and kept in .json.gz
     """
     for year in range(NVD_START_YEAR, datetime.datetime.now().year + 1):
         filename = CVE.download_nvd_year(nvd_dir, year)
         try:
             content = ijson.items(gzip.GzipFile(filename), 'CVE_Items.item')
         except:  # noqa: E722
             print("ERROR: cannot read %s. Please remove the file then rerun this script" % filename)
             raise
         for cve in content:
             yield cls(cve)
Example #59
def extract_words():
    words = defaultdict(int)
    iteration = 0
    with open(POSTS_FILE_PATH, 'r') as f:
        for post in ijson.items(f, 'item'):
            add_words(words, post['content'], 1)
            add_words(words, post['title'], 2)
            add_words(words, post['tags'], 3)

            iteration += 1
            if not iteration % SHOW_PROGRESS_EVERY:
                format_str = 'Words parsing {} iterations passed'
                print(format_str.format(iteration))

    return words
Example #60
def getcontent():
    global articles
    global headlines
    global summary
    with open('nstream.json', 'r') as stream:
        articlestream = ijson.items(stream, 'articles')
        k = list(articlestream)
        articles = list(k[0])
        headlines = [title['title'] for title in articles]
        summary = [sum['description'] for sum in articles]
        print('successfully fetched articles.')
        print("total headlines", len(headlines))
        print("total summaries", len(summary))
        print(" ")
        return headlines, summary