def obtain_past_articles(start, end):
    """Crawl news articles for every commodity in a time frame and post them.

    For each entry of the module-level ``commodity_list`` the articles returned
    by ``alchemy_news_crawler`` are converted with
    ``convert_to_Infusion_JSON_list`` and posted one by one through a freshly
    created module-level ``rest_caller``. The ``_id`` and ``url`` fields of
    every JSON response are recorded in the module-level ``id_list`` mapping
    (``_id`` -> ``url``).

    :param start: start of the time frame, passed through to the crawler
    :param end: end of the time frame, passed through to the crawler
    :return: None
    """
    # Only rest_caller is rebound here; commodity_list and id_list are merely
    # read / mutated in place, so they need no ``global`` declaration.
    global rest_caller
    rest_caller = RestCaller(mean_server_url)

    for commodity in commodity_list:
        crawled = alchemy_news_crawler(commodity, start, end)
        # Walk over every converted document and remember where it was stored.
        for document in convert_to_Infusion_JSON_list(crawled):
            response = rest_caller.post(document)
            stored = json.loads(response.text)
            id_list[stored['_id']] = stored['url']
def historical_data_process(start, end):
    """Request historical index data for a date range and post it.

    Each entry of the module-level ``indexes_list`` is queried through
    ``yql_query``; the result is converted with
    ``convert_to_Infusion_JSON_list`` and every converted item is posted
    through a freshly created module-level ``rest_caller``.
    (Per the original notes the data comes from the Yahoo server and ends up
    in MongoDB -- neither is visible from this function itself.)

    :param start: start date, passed through to ``yql_query``
    :param end: end date, passed through to ``yql_query``
    :return: None
    """
    # indexes_list is only read, so just rest_caller needs to be global.
    global rest_caller
    rest_caller = RestCaller(mean_server_url)

    for index in indexes_list:
        quotes = convert_to_Infusion_JSON_list(yql_query(index, start, end))
        for quote in quotes:
            rest_caller.post(quote)
def init():
    """Find stored documents that lack their full text and extract it.

    Fetches the documents through a freshly created module-level
    ``rest_caller``, collects ``_id`` -> ``url`` for every document whose
    ``content`` still contains the "full text not available" placeholder, and
    hands that mapping to ``alchemy_text_extraction``. Prints a notice instead
    when nothing is missing.

    :return: None
    """
    global rest_caller
    rest_caller = RestCaller(mean_server_url)

    placeholder = "Full text not available. Please use associated URL to view full text:"
    documents = json.loads(rest_caller.get().text)
    missing_content = {
        doc["_id"]: doc["url"]
        for doc in documents
        if placeholder in doc["content"]
    }

    if not missing_content:
        print("No text extractions needed!")
        return

    alchemy_text_extraction(missing_content)
    print("\n\n")
def daily_process(retry_delay=60):
    """Routinely check for new index data and post it, forever.

    Every cycle queries each entry of the module-level ``indexes_list``
    through ``yql_query``, converts the result with
    ``convert_to_Infusion_JSON_list`` and posts every converted item through a
    freshly created module-level ``rest_caller``. After a successful cycle the
    loop sleeps for one hour before checking again.

    When a cycle fails, the loop pauses for ``retry_delay`` seconds and prints
    the cause before retrying. Without that pause a persistent failure (for
    example a network outage) would skip the hourly sleep and retry
    immediately, spinning in a tight loop and flooding the output.

    :param retry_delay: seconds to wait after a failed cycle before retrying
    :return: does not return normally; loops until the process is stopped
    """
    # indexes_list is only read, so just rest_caller needs to be global.
    global rest_caller
    rest_caller = RestCaller(mean_server_url)

    while True:
        try:
            for index in indexes_list:
                raw_quote_list = yql_query(index)
                for item in convert_to_Infusion_JSON_list(raw_quote_list):
                    rest_caller.post(item)
            print("Waiting for next checking period (1hour interval)...")
            time.sleep(3600)
        except Exception as exc:
            # Best-effort loop: keep running, but say what went wrong and back
            # off so a persistent error does not become a busy retry loop.
            print("Network is not stable... retry in next iteration")
            print("Cause: %r" % (exc,))
            time.sleep(retry_delay)
def json_alchemy_entities(entity_unparsed):
    """Post one record per title entity of an article to the entity service.

    The article's ``publicationDate.date`` (format ``%Y%m%dT%H%M%S``) is
    reformatted to ``%Y-%m-%d``; when it is an empty string the module-level
    ``default_date`` is used instead. Each entry of
    ``enrichedTitle.entities`` is then posted through a freshly created
    module-level ``entity_rest_caller`` with its text, count and sentiment
    score.

    :param entity_unparsed: article mapping providing ``publicationDate`` and
        ``enrichedTitle`` as accessed below
    :return: None
    """
    # default_date is only read, so just entity_rest_caller needs to be global.
    global entity_rest_caller
    entity_rest_caller = RestCaller(mean_server_entity_url)

    raw_date = entity_unparsed['publicationDate']['date']
    if raw_date != '':
        parsed = time.strptime(raw_date, "%Y%m%dT%H%M%S")
        entity_date = time.strftime("%Y-%m-%d", parsed)
    else:
        entity_date = default_date

    for entity in entity_unparsed['enrichedTitle']['entities']:
        entity_rest_caller.post({
            "entityDate": entity_date,
            "text": entity['text'],
            "count": entity['count'],
            "sentiment": entity['sentiment']['score'],
        })