def query_state_update_via_queue():
    """Poll the query queue and mark a query 'Completed' once all of its
    associated entries have finished the simplified dump."""
    print("Query state updating queue is live")
    query_collection = refer_query_col()
    mycol = refer_collection()
    connect_str = os.getenv('AZURE_STORAGE_CONNECTION_STRING')
    query_client = QueueClient.from_connection_string(connect_str, "query-queue")
    while True:
        # print('q')
        time.sleep(200)
        rows = query_client.receive_messages()
        for msg in rows:
            time.sleep(10)
            row = msg.content
            print(row)
            row = ast.literal_eval(row)
            print('getting_id', row[0])
            entry_id = ObjectId(row[0])
            query_data_entry = query_collection.find({"_id": entry_id})
            data = [i for i in query_data_entry]
            # check for the completion of the query's components
            try:
                associated_entries = data[0]['associated_entries']
                print('getting associated entries')
                completed_count = 0
                for each_entry_res in associated_entries:
                    res_entry = mycol.find({"_id": each_entry_res})
                    # print("profile", each_entry_res)
                    data_res = [i for i in res_entry]
                    if data_res[0]['simplified_dump_state'] == 'Completed':
                        completed_count += 1
                print('completed_count', completed_count)
                print('entry_count', data[0]['entry_count'])
                if completed_count == data[0]['entry_count']:
                    print("All the entries are completed for the query", completed_count)
                    query_collection.update_one(
                        {'_id': entry_id}, {'$set': {'state': 'Completed'}})
                    query_client.delete_message(msg)
            except KeyError as e:
                print('Query is not yet ready', e)
            except IndexError:
                print('Query entry not available yet')
            except Exception as e:
                print("Exception occurred during dumping", e)

def project_state_update_via_queue():
    """Poll the project-completion queue and mark a project 'Completed' once
    all of its associated queries have completed."""
    print("Project state updating queue is live")
    proj_collection = refer_projects_col()
    query_collection = refer_query_col()
    connect_str = os.getenv('AZURE_STORAGE_CONNECTION_STRING')
    project_comp_client = QueueClient.from_connection_string(
        connect_str, "project-completion-queue")
    while True:
        # print('*')
        time.sleep(600)
        rows = project_comp_client.receive_messages()
        for msg in rows:
            time.sleep(10)
            row = msg.content
            row = ast.literal_eval(row)
            print(row[0])
            entry_id = ObjectId(row[0])
            project_data_entry = proj_collection.find({"_id": entry_id})
            data = [i for i in project_data_entry]
            # check for the completion of the project's queries
            try:
                associated_queries = data[0]['associated_queries']
                completed_count = 0
                for each_query_res in associated_queries:
                    que_entry = query_collection.find({"_id": each_query_res})
                    data_res = [i for i in que_entry]
                    if data_res[0]['state'] == 'Completed':
                        completed_count += 1
                print(['comp', completed_count, data[0]['query_count']])
                if completed_count == data[0]['query_count']:
                    print("All the queries are completed for the project", completed_count)
                    proj_collection.update_one(
                        {'_id': entry_id}, {'$set': {'state': 'Completed'}})
                    project_comp_client.delete_message(msg)
            except KeyError as e:
                print('Project is not yet ready', e)
            except IndexError:
                print('Project entry not available yet')
            except Exception as e:
                print("Exception occurred during dumping", e)

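# The two workers above expect each queue message to be the string form of a
# Python list whose first element is the hex ObjectId of the query/project
# document (they run ast.literal_eval(msg.content) and then ObjectId(row[0])).
# A minimal producer sketch is given below; enqueue_query_for_state_update is
# an illustrative name, not a function used elsewhere in this codebase.
def enqueue_query_for_state_update(query_entry_id):
    """Enqueue a query id so query_state_update_via_queue() will re-check it."""
    connect_str = os.getenv('AZURE_STORAGE_CONNECTION_STRING')
    client = QueueClient.from_connection_string(connect_str, "query-queue")
    # message format mirrors what the consumer above parses with ast.literal_eval
    client.send_message(str([str(query_entry_id)]))
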
def get_entries_project(project_id):
    """Collect the ObjectIds of the entries associated with a project's queries.
    Note: returns the entries of the first query that has any; later queries are
    not aggregated (see the extended diagnostic version further below)."""
    projects_col = refer_projects_col()
    query_collection = refer_query_col()
    proj_data_entry = projects_col.find({"_id": project_id})
    proj_data = [i for i in proj_data_entry]
    proj_attribute_keys = list(proj_data[0].keys())
    if 'associated_queries' in proj_attribute_keys:
        associated_queries = proj_data[0]['associated_queries']
        for each_query in associated_queries:
            query_data_entry = query_collection.find({"_id": ObjectId(each_query)})
            query_data = [i for i in query_data_entry]
            query_attribute_keys = list(query_data[0].keys())
            if 'associated_entries' in query_attribute_keys:
                associated_entries = query_data[0]['associated_entries']
                obs_ids = [ObjectId(i) for i in associated_entries]
                return obs_ids
    else:
        print("This project does not have any queries yet")
        return []

def fix_entry_counts():
    """Diagnostic helper: read pending query-queue messages and report how many
    associated entries each referenced query currently has."""
    print("Query state updating queue is live")
    query_collection = refer_query_col()
    mycol = refer_collection()
    connect_str = os.getenv('AZURE_STORAGE_CONNECTION_STRING')
    query_client = QueueClient.from_connection_string(connect_str, "query-queue")
    rows = query_client.receive_messages()
    for msg in rows:
        # time.sleep(10)
        row = msg.content
        print(row)
        row = ast.literal_eval(row)
        print('getting_id', row[0])
        entry_id = ObjectId(row[0])
        query_data_entry = query_collection.find({"_id": entry_id})
        data = [i for i in query_data_entry]
        # check for the completion of components
        associated_entries = data[0]['associated_entries']
        print('getting associated entries')
        print(len(associated_entries))

def get_entries_project(project_id):
    """Diagnostic version (overrides the definition above): walk every query of a
    project and classify its entries as completed, incomplete, or problematic."""
    completed_count = []
    incomplete_count = 0
    incompletes = []
    problems = []
    all_entries = []
    profile_col = refer_collection()
    projects_col = refer_projects_col()
    query_collection = refer_query_col()
    proj_data_entry = projects_col.find({"_id": project_id})
    print('proj', proj_data_entry)
    proj_data = [i for i in proj_data_entry]
    print('data', len(proj_data))
    proj_attribute_keys = list(proj_data[-1].keys())
    if 'associated_queries' in proj_attribute_keys:
        associated_queries = proj_data[-1]['associated_queries']
        for each_query in associated_queries:
            query_data_entry = query_collection.find({"_id": ObjectId(each_query)})
            query_data = [i for i in query_data_entry]
            print([query_data[0]['search_query'],
                   query_data[0]['state'],
                   query_data[0]['_id']])
            query_attribute_keys = list(query_data[0].keys())
            if 'associated_entries' in query_attribute_keys:
                associated_entries = query_data[0]['associated_entries']
                # print('kk', associated_entries)
                obs_ids = [ObjectId(i) for i in associated_entries]
                all_entries.extend(obs_ids)
                for k in obs_ids:
                    prof_data_entry = profile_col.find({"_id": k})
                    prof_data = [i for i in prof_data_entry]
                    prof_attribute_keys = list(prof_data[0].keys())
                    if 'simplified_dump_state' in prof_attribute_keys:
                        if prof_data[0]['simplified_dump_state'] == 'Completed':
                            completed_count.append(k)
                        elif prof_data[0]['simplified_dump_state'] == 'Incomplete':
                            incomplete_count += 1
                            incompletes.append(k)
                        else:
                            problems.append(k)
                    else:
                        problems.append(k)
                # print(['completed', completed_count, 'all', len(obs_ids),
                #        'incompleted', incomplete_count, incompletes, 'prob', problems])
                # filt = [k for k in obs_ids if k not in problems]
                # print('filt', filt)
                # if completed_count == len(obs_ids):
                #     query_collection.update_one({'_id': ObjectId(each_query)},
                #                                 {'$set': {'state': 'Completed'}})
                #     return obs_ids
        print('completed_count', len(list(set(completed_count))))
        print('incomplete_count', incomplete_count)
        print('incompletes', list(set(incompletes)))
        print('problems', list(set(problems)))
        print('all', all_entries)
        return {
            'incompletes': list(set(incompletes)),
            'problems': list(set(problems))
        }
        # all_entries = list(set(all_entries))
        # return all_entries
    else:
        print("This project does not have any queries yet")
        return []

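# Example (assumed) use of the diagnostic helper above; the project id below is
# a placeholder and must be replaced with a real ObjectId from the projects
# collection.
# report = get_entries_project(ObjectId('<project-id>'))
# print(report['incompletes'], report['problems'])
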
def search_a_company(comp_name, db_collection, query_entry):
    """Run an initial Google search for a company, filter the results, and store
    the best match as a profile record linked to the given query entry."""
    try:
        sr = getGoogleLinksForSearchText(comp_name + " Australia", 10, 'initial')
        count = 0
        while sr == 'captcha':
            count = count + 1
            print('captcha detected, sleeping; attempt', count)
            time.sleep(1200 * count)
            sr = getGoogleLinksForSearchText(comp_name, 10, 'initial')
        with open(three_up + '//Simplified_System//Initial_Crawling//black_list.txt', 'r') as b_list_file:
            black_list = b_list_file.read().splitlines()  # e.g. 'www.dnb.com'
        received_links = [i['link'] for i in sr]
        print(received_links)
        # filter the links: keep only commercial-looking TLDs
        filtered_sr_a = []
        filtered_received_links = []
        for i, each_l in enumerate(received_links):
            if (('.com/' in each_l) or ('.education/' in each_l) or ('.io/' in each_l)
                    or ('.com.au/' in each_l) or ('.net/' in each_l) or ('.co.nz/' in each_l)
                    or ('.nz/' in each_l) or ('.au/' in each_l) or ('.biz/' in each_l)):
                filtered_received_links.append(each_l)
                filtered_sr_a.append(sr[i])
        print(filtered_sr_a)
        received_domains = [i.split("/")[2] for i in filtered_received_links]
        filtered_sr = []
        print('rd', received_domains)
        for i, each in enumerate(received_domains):
            # filter out unwanted (government/educational/organisational) domains
            if (('.gov.' in each) or ('org' in each) or ('.govt.' in each)
                    or ('.edu.' in each) or ('.uk' in each)):
                continue
            if each not in black_list:
                filtered_sr.append(filtered_sr_a[i])
        if len(filtered_sr):
            # is the link already taken?
            res_data = is_profile_exist(filtered_sr[0]['link'])
            if len(res_data):
                print("Profile " + filtered_sr[0]['link'] + " already existing at "
                      + str(res_data[0]['_id']))
                query_collection = refer_query_col()
                query_collection.update_one(
                    {'_id': query_entry},
                    {'$set': {'associated_entries': [res_data[0]['_id']]}})
                return 'exist'
            # derive the company name from the domain
            c_n_link = filtered_sr[0]['link']
            c_n_dom = c_n_link.split("/")[2]
            try:
                c_name = c_n_dom.split("www.")[1]
            except IndexError:
                c_name = c_n_dom
            if '.com' in c_name:
                cc_name = c_name.split(".com")[0]
            elif '.org' in c_name:
                cc_name = c_name.split(".org")[0]
            elif '.io' in c_name:
                cc_name = c_name.split(".io")[0]
            elif '.net' in c_name:
                cc_name = c_name.split(".net")[0]
            else:
                cc_name = c_name
            filtered_sr[0]['comp_name'] = cc_name
            filtered_sr[0]['query_id'] = query_entry
            filtered_sr[0]['ignore_flag'] = '0'
            rich_description = filtered_sr[0]['rich_description']
            if rich_description != 'None' and rich_description != '':
                filtered_sr[0]['description'] = rich_description
            record_entry = db_collection.insert_one(filtered_sr[0])
            print(filtered_sr[0])
            print("search record stored in db: ", record_entry.inserted_id)
            return record_entry.inserted_id
        else:
            print("No results found!")
            return None
    except Exception as e:
        print("Error occurred! try again", e)
        return 'error'

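# The domain-filtering rule used in search_a_company() above is repeated inside
# search_a_query() below. A possible shared helper is sketched here;
# is_excluded_domain is a hypothetical name and is not called by the existing code.
def is_excluded_domain(domain, black_list):
    """Return True when a result domain should be skipped: blacklisted, or a
    government/educational/organisational/UK domain."""
    if domain in black_list:
        return True
    unwanted_markers = ('.gov.', 'org', '.govt.', '.edu.', '.uk')
    return any(marker in domain for marker in unwanted_markers)
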
def search_a_query(search_query, number_of_results, db_collection, query_entry):
    """Run a Google search for a query, build a profile record for each new
    result domain, and link the created/existing entries to the query entry."""
    try:
        sr = getGoogleLinksForSearchText(search_query, number_of_results, 'normal')
        print('came', len(sr))
        count = 0
        while sr == 'captcha':
            count = count + 1
            print('captcha detected, sleeping; attempt', count)
            time.sleep(1200 * count)
            sr = getGoogleLinksForSearchText(search_query, number_of_results, 'normal')
        if len(sr):
            # record_entry = db_collection.insert_many(sr)
            for each_sr in sr:
                print(each_sr)
            received_links = [i['link'] for i in sr]
            # filter the links: keep only commercial-looking TLDs
            filtered_received_links = []
            for each_l in received_links:
                if (('.com/' in each_l) or ('.education/' in each_l) or ('.io/' in each_l)
                        or ('.com.au/' in each_l) or ('.net/' in each_l) or ('.co.nz/' in each_l)
                        or ('.nz/' in each_l) or ('.au/' in each_l) or ('.biz/' in each_l)):
                    filtered_received_links.append(each_l)
            query_collection = refer_query_col()
            received_domains = [i.split("/")[2] for i in filtered_received_links]
            print("received_domains", received_domains)
            # note: deduplicating here means received_domains[k] no longer lines up
            # with received_links[k] in the loop below
            received_domains = list(set(received_domains))
            print("received_domains", received_domains)
            ids_list = []
            already_existing_count = 0
            for k in range(len(received_domains)):
                time.sleep(10)
                print(received_links[k], received_domains[k])
                with open(three_up + '//Simplified_System//Initial_Crawling//black_list.txt', 'r') as b_list_file:
                    black_list = b_list_file.read().splitlines()
                if received_domains[k] in black_list:
                    # filter unwanted (blacklisted) websites
                    continue
                if (('.gov.' in received_domains[k]) or ('org' in received_domains[k])
                        or ('.govt.' in received_domains[k]) or ('.edu.' in received_domains[k])
                        or ('.uk' in received_domains[k])):
                    # filter unwanted (government/educational/organisational) websites
                    continue
                sr = getGoogleLinksForSearchText(received_domains[k], 3, 'initial')
                if len(sr) == 0:
                    sr = getGoogleLinksForSearchText(received_domains[k], 3, 'initial')
                    if len(sr) == 0:
                        sr = getGoogleLinksForSearchText(received_domains[k], 3, 'initial')
                if len(sr) > 0:
                    print(sr[0])
                    res_data = is_profile_exist(sr[0]['link'])
                    if len(res_data):
                        print("Profile " + sr[0]['link'] + " already existing at "
                              + str(res_data[0]['_id']))
                        already_existing_count += 1
                        # updating the query's associated entries
                        qq_data_entry = query_collection.find({"_id": query_entry})
                        qq_data = [i for i in qq_data_entry]
                        qq_attribute_keys = list(qq_data[0].keys())
                        if 'associated_entries' in qq_attribute_keys:
                            print('in main')
                            query_collection.update_one(
                                {'_id': query_entry},
                                {'$set': {'associated_entries':
                                          qq_data[0]['associated_entries'] + [res_data[0]['_id']]}})
                        else:
                            query_collection.update_one(
                                {'_id': query_entry},
                                {'$set': {'associated_entries': [res_data[0]['_id']]}})
                        continue
                    sr[0]['search_text'] = search_query
                    try:
                        c_name = received_domains[k].split("www.")[1]
                    except IndexError:
                        c_name = received_domains[k]
                    if '.com' in c_name:
                        sr[0]['comp_name'] = c_name.split(".com")[0]
                    elif '.org' in c_name:
                        sr[0]['comp_name'] = c_name.split(".org")[0]
                    elif '.co' in c_name:
                        sr[0]['comp_name'] = c_name.split(".co")[0]
                    elif '.edu' in c_name:
                        sr[0]['comp_name'] = c_name.split(".edu")[0]
                    else:
                        sr[0]['comp_name'] = c_name
                    print(sr[0])
                    sr[0]['query_id'] = query_entry
                    sr[0]['ignore_flag'] = '0'
                    rich_description = sr[0]['rich_description']
                    if rich_description != 'None' and rich_description != '':
                        sr[0]['description'] = rich_description
                    record_entry = db_collection.insert_one(sr[0])
                    print("search record stored in db: ", record_entry.inserted_id)
                    ids_list.append(record_entry.inserted_id)
                else:
                    print("Cannot find results, skipping company")
            print('from initial crawling', ids_list)
            entry_count = already_existing_count + len(ids_list)
            print("Total entry count", entry_count)
            query_collection.update_one({'_id': query_entry},
                                        {'$set': {'entry_count': entry_count}})
            return ids_list
            # print("search records stored in db: ", record_entry.inserted_ids)
        else:
            print("No results found!")
            return None
    except Exception as e:
        print("Error occurred! try again", e)
        return 'error'


# store search results in a csv file
# with open('search_results.csv', mode='w', encoding='utf8') as results_file:
#     results_writer = csv.writer(results_file, delimiter=',', quotechar='"',
#                                 quoting=csv.QUOTE_MINIMAL)
#     for each_item in sr:
#         results_writer.writerow([each_item['title'], each_item['link'], each_item['description']])

# mycol = refer_collection()
# search_a_company('TOOHEYS PTY LIMITED', mycol)
# search_a_company('CALTEX PETROLEUM PTY LTD', mycol)
# search_a_query('Digital advertisement and marketing analytics services company', 5, mycol)

def execute_for_a_company(comp_name):
    """Run the full profiling pipeline for a single company name."""
    mycol = refer_collection()
    print("Searching a company")
    dateTimeObj = datetime.now()
    query_collection = refer_query_col()
    data_q = {'started_time_stamp': dateTimeObj, 'search_query': comp_name}
    record_entry = query_collection.insert_one(data_q)
    print("Started on", dateTimeObj)
    started = time.time()
    print("***Initial Crawling Phase***")
    entry_id = search_a_company(comp_name, mycol, record_entry.inserted_id)
    if entry_id is None:
        for i in range(3):
            print("Initial crawling incomplete..retrying", i)
            entry_id = search_a_company(comp_name, mycol, record_entry.inserted_id)
            time.sleep(5)
            if entry_id is not None:
                break
    if entry_id is None:
        print("Initial crawling incomplete..retrying unsuccessful")
    elif entry_id == 'exist':
        print("Existing profile found. pipeline exits")
    else:
        print("entry id received ", entry_id)
        print("***Deep Crawling Phase***")
        deep_crawl([entry_id], 3, 100)
        print("Deep crawling completed and record extended with crawled_links, header_text, "
              "paragraph_text, social_media_links, telephone numbers, emails, addresses")
        print("***Feature Extraction Phase***")
        extract_features([entry_id])
        print("***Contact Person Extraction Phase***")
        extract_contact_persons([entry_id], 'comp')
        print("***Predict the company type***")
        predict_class_tags([entry_id])
        print("***Extract crunchbase profile data***")
        get_cb_data([entry_id])
        print("***Extract linkedin profile data***")
        get_li_data([entry_id])
        print("***Extract opencorporates profile data***")
        get_oc_data([entry_id])
        print("***Extract dnb profile data***")
        get_dnb_data([entry_id])
        print("***Addresses from google***")
        get_ad_from_google([entry_id])
        print("***Addresses extraction completed***")
        print("***cp from google***")
        get_cp_from_google([entry_id])
        print("***cp extraction completed***")
        print("***phone numbers from google***")
        get_tp_from_google([entry_id])
        print("***tp extraction completed***")
        print("***frequent questions google***")
        get_qa_from_google([entry_id])
        print("***qa extraction completed***")
        print("***Dumping the results***")
        # export_profiles([entry_id], record_entry.inserted_id)
        simplified_export([entry_id])
        ended = time.time()
        duration = ended - started
        dateTimeObj_e = datetime.now()
        completion_data = {
            'completed_time_stamp': dateTimeObj_e,
            'elapsed_time': duration
        }
        print(completion_data)
        query_collection.update_one({'_id': record_entry.inserted_id},
                                    {'$set': completion_data})
        print("Pipeline execution completed, elapsed time:", duration)

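# Example (assumed) invocation of the single-company pipeline, in the same style
# as the commented-out calls above; the company name is a placeholder.
# execute_for_a_company('EXAMPLE COMPANY PTY LTD')
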
rows = ic_client.receive_messages()
for msg in rows:
    # time.sleep(120)
    row = msg.content
    row = ast.literal_eval(row)
    print(row[0])
    # messages are of the form "<search text>--<mode>"
    input_d = row[0].split("--")
    try:
        mode = input_d[1]
        s_text = input_d[0]
        if mode == 'query':
            query = s_text.strip()
            print("Searching a query")
            dateTimeObj = datetime.now()
            query_collection = refer_query_col()
            data_q = {'started_time_stamp': dateTimeObj, 'search_query': query}
            record_entry = query_collection.insert_one(data_q)
            print("Started on", dateTimeObj)
            started = time.time()
            print("***Initial Crawling Phase***")
            entry_id_list = search_a_query(query, 10, mycol, record_entry.inserted_id)
            if entry_id_list is None:
                for i in range(3):
                    print("Initial crawling incomplete..retrying", i)
                    entry_id_list = search_a_query(query, 10, mycol, record_entry.inserted_id)
                    time.sleep(5)
                    if entry_id_list is not None:
                        break
            if entry_id_list is None:
                print("Initial crawling incomplete..retrying unsuccessful")
            elif entry_id_list == 'error':