def metadata(*urns):
    result = query(
        fq=['dpaId:"%s"' % urn for urn in urns],
        wt="json",
        omitHeaders=True,
        q='*',
        fl='rfc4180,dpaId,dpaTitle')
    return result["response"]["docs"]
def newest(sourceId="dpa"):
    result = query(
        fq=[
            "createdAt:[NOW/HOUR-1HOUR TO NOW/HOUR+1HOUR]",
            'sourceId:"%s"' % sourceId,
        ],
        wt="json",
        omitHeaders=True,
        q='*',
        rows=100,
        sort="createdAt desc",
        fl='createdAt,dpaId')
    return result["response"]["docs"]
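# Illustrative usage of the two helpers above (a sketch only: the URN below is a
# placeholder, not a real dpaId, and `query` must be importable from the
# neofonie module as in the notebook code):
if __name__ == "__main__":
    # most recent dpa documents from the current hour window
    for doc in newest("dpa"):
        print(doc["createdAt"], doc["dpaId"])
    # metadata lookup for specific documents by dpaId
    for doc in metadata("urn:example:dpa-id-1"):
        print(doc.get("dpaId"), doc.get("dpaTitle"))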
# coding: utf-8

# In[60]:

from neofonie import query
from collections import defaultdict

unknown = query(
    '*&wt=json&fq=createdAt:[2016-07-22T15:44:39.000Z%20TO%202016-07-23T04:44:39.000Z]'
    '&fq=sourceId:"twitter"&facet.mincount=1&fq=labels:"M%C3%BCnchen"&facet=true'
    '&facet.field=unknownPersonsSurfaceforms&facet.limit=200&facet.missing=true'
    '&f.unknownPersonsSurfaceforms.facet.sort=count&facet.method=enum'
)

persons = defaultdict(lambda: 0)
personsTable = unknown["facet_counts"]["facet_fields"]['unknownPersonsSurfaceforms']
for i in range(0, len(personsTable), 2):
    persons[str(personsTable[i])] += personsTable[i + 1]


# In[86]:

# exclude based on regular expressions
import re
exclude = {
    a
    for a in persons.keys()
    if re.search(r"\bGmbh\b", a, re.I)
    or re.search(
        r"\b(Mon|Tue|Wed|Thu|Fri|Sat|Sun)\s+(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\b",
        a, re.I)
    or re.search(r"[:\"]", a)
    or re.search(r" \. ", a)
}
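# In[ ]:

# Small, self-contained illustration of the two steps above (toy data, not real
# Solr output): facet_fields lists come back as alternating value/count pairs,
# which the loop folds into `persons`; the regex set `exclude` then drops
# obvious non-person strings such as company names and date fragments.
toy_table = ["Angela Merkel", 12, "Foo Bar GmbH", 3, "Mon Jul 25", 2]
toy_persons = defaultdict(lambda: 0)
for i in range(0, len(toy_table), 2):
    toy_persons[str(toy_table[i])] += toy_table[i + 1]
toy_exclude = {
    a for a in toy_persons
    if re.search(r"\bGmbh\b", a, re.I)
    or re.search(r"\b(Mon|Tue|Wed|Thu|Fri|Sat|Sun)\s+"
                 r"(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\b", a, re.I)
}
print({k: v for k, v in toy_persons.items() if k not in toy_exclude})
# -> {'Angela Merkel': 12}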
def mkdirs_and_open(f, *args):
    """Open f, creating its parent directories first (assumed helper; see its use in generate())."""
    os.makedirs(os.path.dirname(f) or ".", exist_ok=True)
    return open(f, *args)


for day in range(1, days):
    date = startdate - datetime.timedelta(days=day)
    label = indexlabel.format(**locals())
    results = OrderedDict()
    indexfilename = indexfile.format(**locals())
    docs[label] = indexfilename
    dayquery = copy.deepcopy(basequery)
    dayquery["fq"].append(
        date.strftime('createdAt:[%Y-%m-%dT00:00:00.000Z TO %Y-%m-%dT23:59:59.999Z]'))
    for (k, n) in branchen.items():
        nq = copy.deepcopy(dayquery)
        nq["fq"].append('sectors:"{0}"'.format(k))
        res = filter_response(
            deduplicate(query('*', **nq), attr="dpaId"),
            filterfunction=lambda a: True)
        logging.debug("Sector: %s - %s - %s docs" %
                      (k, date.strftime("%Y-%m-%d"), len(res["response"]["docs"])))
        if len(res["response"]["docs"]) > 0:
            for d in res["response"]["docs"]:
                for (old, new) in attributeNames.items():
                    if old in d:
                        d[new] = copy.deepcopy(d[old])
                        del d[old]
            results[k] = res
            results[k]["label"] = n
    for nr in results.values():
        for doc in nr["response"]["docs"]:
            filename = docfile.format(**locals())
def generate(config):
    texts = {}
    docs = OrderedDict()
    for day in range(0, config.days):
        date = config.startdate - datetime.timedelta(days=day)
        label = config.indexlabel.format(**locals())
        results = OrderedDict()
        indexfilename = config.indexfile.format(**locals())
        docs[label] = indexfilename
        dayquery = copy.deepcopy(config.basequery)
        dayquery["fq"].append(
            date.strftime(
                'createdAt:[%Y-%m-%dT00:00:00.000Z TO %Y-%m-%dT23:59:59.999Z]'))
        for (k, n) in config.branchen.items():
            nq = copy.deepcopy(dayquery)
            nq["fq"].append('+sectors:"{0}"'.format(k))
            res = list(
                neofonie.query("*", **nq)["response"]["docs"]
                | datapipeline.rename_attributes(config.rename)
                | pipe.where(config.filter)
                | datapipeline.deduplicate(key=lambda a: a["title"])
                | datapipeline.default_attributes(('sourcelink', 'source', 'subtitle'))
                | datapipeline.call(add_sectors_to_subtitle))
            logging.debug("Sector: %s - %s - %s docs" %
                          (k, date.strftime("%Y-%m-%d"), len(res)))
            for item in res:
                logging.debug("  %s %s %s" %
                              (item["sectors"], item["title"], item["text"][:30]))
            if len(res) > 0:
                results[k] = dict(docs=res, label=n)
        for nr in results.values():
            for doc in nr["docs"]:
                filename = config.docfile.format(**locals())
                doc["document"] = filename
                ndoc = copy.deepcopy(doc)
                ndoc["index"] = os.path.join("..", indexfilename)
                ndoc["sector"] = doc["sectors"][0]
                ndoc["root"] = os.path.join("..", config.rootfile)
                ndoc["source"] = "ex neoApplication"
                ndoc["sourcelink"] = "ex neoURL"
                ndoc["subtitle"] = "Untertitel zu {}".format(ndoc.get("title", "---"))
                texts[os.path.join(config.directory, filename)] = ndoc
                if "text" in doc:
                    del doc["text"]
        with mkdirs_and_open(os.path.join(config.directory, indexfilename), "w") as of:
            json.dump(
                dict(news=results, root=config.rootfile, rootlabel=config.rootlabel),
                of)
            logging.info("%s items written to %s" %
                         (reduce(lambda a, b: a + b,
                                 (len(a["docs"]) for a in results.values()), 0),
                          of.name))
    for (k, v) in texts.items():
        json.dump(v, mkdirs_and_open(k, "w"))
    logging.debug("%s news objects written" % len(list(texts.keys())))
    t = copy.deepcopy(config.template)
    t["chapters"] = docs
    json.dump(t, open(os.path.join(config.directory, config.rootfile), "w"))
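# Sketch of a configuration object for generate(). All values below are
# illustrative assumptions, not the project's real configuration; the point is
# to list the attributes generate() reads: days, startdate, indexlabel,
# indexfile, basequery, branchen, rename, filter, docfile, directory,
# rootfile, rootlabel and template.
from types import SimpleNamespace

example_config = SimpleNamespace(
    days=7,
    startdate=datetime.date(2016, 7, 25),
    indexlabel="{date:%Y-%m-%d}",
    indexfile="index-{date:%Y-%m-%d}.json",
    basequery=dict(fq=['sourceId:"dpa"'], wt="json", rows=100),
    branchen={"economy": "Wirtschaft"},          # sector key -> display label
    rename={"dpaTitle": "title", "dpaId": "id"},  # attribute renaming for the pipeline
    filter=lambda doc: True,                      # keep every document
    docfile="doc-{day}-{k}.json",
    directory="output",
    rootfile="root.json",
    rootlabel="Neofonie news",
    template=dict(title="Branchennews"),
)
# generate(example_config)  # would query the backend and write the JSON files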
def neofonie_query(day1, day2, rows_batch):
    # build the createdAt range filter from the two date strings
    daytext1 = "createdAt:["
    daytext2 = "T00:00:00.001Z TO "
    daytext3 = "T23:59:59.999Z]"
    dayframe = "".join([daytext1, day1, daytext2, day2, daytext3])

    counter = 1
    start_position = 0
    numFound = None  # total hit count, unknown until the first batch arrives

    while numFound is None or start_position <= numFound:
        result = query(
            '*:*',
            wt="json",
            fq=[
                "sourceId:dpa",
                dayframe,
                "-dpaRessort:rs",
                "-dpaTitle:Tagesvorschau",
                "-dpaTitle:Abendvorschau",
                "-dpaTitle:Morgenvorschau",
                "-dpaTitle:Terminvorschau",
                "-dpaTitle:DAX",
                "-dpaTitle:Ausgewählte Investmentfond",
                "-dpaTitle:*Ausgewählte Auslandsaktien*",
                "-dpaTitle:EuroStoxx",
                "-dpaTitle:MDAX",
                "-dpaTitle:TecDAX",
                "-dpaTitle:Devisenkurse am",
                "-dpaId:*dpa-afx*",
                # "-text:-----------------------",
                "-text:berichtet dpa heute wie folgt",
                "-dpaTitle:DGAP-News"
            ],
            fl="createdAt,dpaTitle,dpaId,dpaRessort,dpaServices,text,"
               "dpaKeywords,dpaService,dpaservices,dpaservice",
            # "dpaServices",
            # "createdAt",
            # "dpaId",
            sort="createdAt asc",
            start=start_position,
            rows=rows_batch)
        print("\ndownloaded batch\n")

        numFound = int(result["response"]["numFound"])
        amount_batches = numFound // rows_batch
        last_batch = numFound % rows_batch
        print("\n Amount of articles:", numFound, "\n")

        docs = result["response"]["docs"]
        for doc in docs:
            # print("Loop for file {0} with title {dpaTitle}".format(filename, **d))
            # d["dpaTitle"]
            #
            # ## DPA ID as filename
            # string_begin_temp = doc["dpaId"]
            # string_begin_temp = string_begin_temp.replace(":", "_")
            # string_begin = string_begin_temp.replace('/', 'v-')
            #
            # # Writing the file
            # string_end = ".json"
            # filename = "".join([string_begin, string_end])
            # foldername = doc["createdAt"][0:10]
            # filename = "/Users/alex/python_project/outputs/DPA-Meldungen/{string_begin}.json".format(**locals())
            #
            # # directory name from createdAt, created with os.makedirs in a try/except
            # file_path = "".join(["/Users/alex/python_project/outputs/DPA-Meldungen/", foldername, "/"])
            # try:
            #     os.makedirs(file_path)
            # except OSError:
            #     if not os.path.isdir(file_path):
            #         raise
            # with open(file_path + filename, 'w') as f:
            #     json.dump(doc, f)
            # print("\nSaved:", filename, "\n", "Article Number", counter)

            print(doc)
            insert_dic = {
                "dpaId": doc["dpaId"],
                "text": doc["text"],
                "createdAt": doc["createdAt"],
                "dpaTitle": doc["dpaTitle"],
                "dpaRessort": doc["dpaRessort"]
                # "dpaServices": doc["dpaServices"]
            }
            articles.insert(insert_dic)
            counter = counter + 1

            # Moving the file
            # old_position = "".join(["/Users/alex/python_project/", filename])
            # new_position = "".join(["/Users/alex/python_project/outputs/DPA-Meldungen/", filename])
            # os.rename(old_position, new_position)
            # print("\nMoved file", filename)

        if start_position <= amount_batches * rows_batch:
            start_position = start_position + rows_batch
        else:
            start_position = start_position + rows_batch
            rows_batch = last_batch

    print("\n\n**FINISHED**")
    print(doc)
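# Illustrative call (a sketch, not part of the original script): fetch one day
# of dpa articles in batches of 100. It assumes `query` is importable from the
# neofonie module and that `articles` is a collection-like object with an
# insert() method (for example a TinyDB table or a MongoDB collection) defined
# at module level before neofonie_query() runs.
if __name__ == "__main__":
    neofonie_query("2016-07-22", "2016-07-22", 100)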