def add_to_initial_crawling_queue(name_list):
    mycol = refer_collection()
    connect_str = os.getenv('AZURE_STORAGE_CONNECTION_STRING')
    ic_client = QueueClient.from_connection_string(connect_str, "initial-crawling-queue")
    for name in name_list:
        print(name)
        ic_client.send_message([str(name)])
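# Hedged usage sketch (not in the original source): the consumer of this queue splits
# each message on "--" to separate the search text from a mode flag ('comp' or 'query'),
# so queued items presumably look like the examples below.
# add_to_initial_crawling_queue(["Acme Widgets Australia --comp",
#                                "environmental consultants melbourne --query"])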
def run_sequential_crawlers_m(id_list, depth_limit, crawl_limit):  # method used to run the crawler
    """
    :param id_list: list of ids of the database entries to crawl further
    :param depth_limit: max depth to crawl
    :param crawl_limit: max page count to crawl
    :return:
    """
    mycol = refer_collection()
    # mycol = refer_cleaned_collection()
    for entry_id in id_list:  # going n levels deep for each google search result
        comp_data_entry = mycol.find({"_id": entry_id})
        data = [i for i in comp_data_entry]
        print(data)
        if (data[0]['link'] == 'None'):
            continue
        print(data[0]['link'])
        print("started", data[0]['_id'], data[0]['link'] + " scraping into n depth")
        # configuring the crawlers
        # lists for collecting crawling data
        crawled_links = []
        header_text = []
        paragraph_text = []
        telephone_numbers = []
        emails = []
        social_media_links = []
        addresses = []
        allowed_domains = data[0]['link'].split("/")[2]  # getting the allowed domain from the starting url itself
        print('allowed_dm', allowed_domains)
        custom_settings = {
            'DEPTH_LIMIT': str(depth_limit),  # setting depth limit of crawling
        }
        crawl_limit = crawl_limit  # setting crawl limit aka number of links going to crawl
        yield runner.crawl(NCrawlerSpider,
                           start_urls=[data[0]['link'], ],
                           allowed_domains=[allowed_domains, ],
                           custom_settings=custom_settings,
                           crawled_links=crawled_links,
                           header_text=header_text,
                           paragraph_text=paragraph_text,
                           telephone_numbers=telephone_numbers,
                           addresses=addresses,
                           social_media_links=social_media_links,
                           emails=emails,
                           crawl_limit=crawl_limit,
                           entry_id=entry_id)
        print("one done")
    # print("reactor is stopping")
    # reactor.callFromThread(reactor.stop)
    # print(' reactor stops',threading.currentThread().ident)
    reactor.stop()
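# Hedged driver sketch (assumption: `runner` is a module-level scrapy.crawler.CrawlerRunner
# and this generator carries a @defer.inlineCallbacks decorator in the original module, as in
# the standard Scrapy "run spiders sequentially" recipe; ids below are placeholders).
# from twisted.internet import reactor
# from scrapy.crawler import CrawlerRunner
# runner = CrawlerRunner()
# run_sequential_crawlers_m(entry_ids, depth_limit=3, crawl_limit=100)
# reactor.run()  # blocks until reactor.stop() fires after the last crawl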
def get_cp_oc(entry_id, mode):
    # myclient = pymongo.MongoClient("mongodb://localhost:27017/")
    # mydb = myclient["CompanyDatabase"]  # refer the database
    # mycol = mydb["comp_data"]  # refer the collection
    mycol = refer_collection()
    comp_data_entry = mycol.find({"_id": entry_id})
    data = [i for i in comp_data_entry]
    # comp_name = data[0]['search_text']
    try:
        if mode == 'comp':
            comp_name = data[0]['search_text']
        elif mode == 'query':
            comp_name = data[0]['comp_name']
    except KeyError:
        comp_name = data[0]['link'].split("/")[2]
    det = [comp_name]
    sr = getGoogleLinksForSearchText(comp_name + " opencorporates", 3, 'normal')
    filtered_oc = []
    for p in sr:
        if (('opencorporates.com/companies/nz' in p['link']) or ('opencorporates.com/companies/au' in p['link'])):
            filtered_oc.append([p['title'], p['link']])
    if (len(filtered_oc)):
        print(filtered_oc[0])
        det.append(filtered_oc[0])
        det.append(scrape_opencorporates(filtered_oc[0][1]))
        print(det)
        mycol.update_one({'_id': entry_id}, {'$set': {'oc_cp_info': det}})
        print("Successfully extended the data entry with opencorporates contact person data", entry_id)
    else:
        print("No opencorporates profile found! Try again")
        mycol.update_one({'_id': entry_id}, {'$set': {'oc_cp_info': det}})
def get_li_url(entry_id):
    mycol = refer_collection()
    comp_data_entry = mycol.find({"_id": entry_id})
    data = [i for i in comp_data_entry]
    try:
        sm_links = data[0]['social_media_links']
    except Exception:
        sm_links = []
    linked_in_comp_urls = []
    for each in sm_links:
        if ('linkedin.com/company' in each):
            linked_in_comp_urls.append(each)
    if (len(linked_in_comp_urls)):
        print("Linkedin profile collected from crawled data")
        print("linkedin taken from crawling")
        return linked_in_comp_urls[0]
    else:
        comp_name = data[0]['comp_name']
        print(data[0]['comp_name'])
        sr = getGoogleLinksForSearchText(comp_name + " linkedin australia", 5, 'normal')
        if (len(sr) == 0):
            sr = getGoogleLinksForSearchText(comp_name + " linkedin australia", 5, 'normal')
        if (len(sr) == 0):
            sr = getGoogleLinksForSearchText(comp_name + " linkedin australia", 5, 'normal')
        filtered_li = []
        for p in sr:
            # print(p['link'])
            if 'linkedin.com/company' in p['link']:
                filtered_li.append(p['link'])
        if (len(filtered_li)):
            return filtered_li[0]
        else:
            print("No linkedin contacts found! Try again")
            return False
def get_li_data(id_list):
    mycol = refer_collection()
    for entry_id in id_list:
        time.sleep(60)
        li_url = get_li_url(entry_id)
        if (li_url):
            print(li_url)
            # blockPrint()
            comp_li_data = scrape_company(li_url)
            # enablePrint()
            # print(comp_li_data)
            corrected_dict = {k + '_li': v for k, v in comp_li_data.items()}
            # corrected_dict['li_url']=li_url
            if ('headquarters_li' in corrected_dict.keys()):
                # print(corrected_dict['headquarters_li'])
                if (not isvalid_hq(corrected_dict['headquarters_li'])):
                    corrected_dict = {}
            # for k in corrected_dict:
            #     print("'"+str(k)+"'")
            print(corrected_dict)
            if (len(corrected_dict.keys())):
                corrected_dict['li_url'] = li_url
                mycol.update_one({'_id': entry_id}, {'$set': corrected_dict})
                print("Successfully extended the data entry with linkedin profile information", entry_id)
            else:
                print("No correct linkedin profile found! dict is empty")
def run_mallet_model(entry_id, number_of_topics):
    # this will grab the paragraph and header text for the given entry and extract topics from it
    mycol = refer_collection()
    comp_data_entry = mycol.find({"_id": entry_id})
    data = [i for i in comp_data_entry]
    print("lda mallet model started", str(data[0]['_id']), data[0]['link'])
    print('Grabbing paragraph and header text from database...')
    try:
        h_p_data = data[0]["paragraph_text"] + data[0]["header_text"]  # do topic extraction on paragraph and header text
        # print(h_p_data)
        data_words = list(sent_to_words(h_p_data))
        # print("data_words",data_words)
        print('remove_punctuations...')
        # Remove Stop Words
        data_words_nostops = remove_stopwords(data_words)
        # Do lemmatization keeping only noun, adj, vb, adv
        data_lemmatized = lemmatization(data_words_nostops, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
        # print('data_lemmatized...')
        # Create Dictionary
        id2word = corpora.Dictionary(data_lemmatized)
        # print('id2word',id2word)
        # Create Corpus
        texts = data_lemmatized
        # Term Document Frequency
        corpus = [id2word.doc2bow(text) for text in texts]
        # print('corpus',corpus)
        # View
        print('corpus is created')  # (word, frequency of occurring)
        topics = []
        mallet_list = []
        mallet_path = 'F:/Armitage_project/crawl_n_depth/utilities/new_mallet/mallet-2.0.8/bin/mallet'  # update this path
        ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=number_of_topics, id2word=id2word)
        print('extracting topics...')
        mallet_list = {
            'Topic_' + str(i): [word for word, prob in ldamallet.show_topic(i, topn=10)]
            for i in range(0, ldamallet.num_topics)
        }
        mycol.update_one({'_id': entry_id}, {'$set': {'mallet_results': mallet_list}})
        print(mallet_list)
        print("Successfully extended the data entry with mallet results", entry_id)
    except Exception:  # handling exceptions if corpus is empty
        print("corpus is empty or not valid / mallet model cannot continue")
        mycol.update_one({'_id': entry_id}, {'$set': {'mallet_results': []}})
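# Hedged usage sketch (assumptions: a local Mallet 2.0.8 install that `mallet_path` above
# points to, and gensim < 4.0 so that gensim.models.wrappers.LdaMallet is still available;
# the entry id is a placeholder).
# run_mallet_model(ObjectId('<profile_id>'), number_of_topics=5)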
def run_lda_model(entry_id, number_of_topics):
    # this will grab the paragraph and header text for the given entry and extract topics from it
    mycol = refer_collection()
    comp_data_entry = mycol.find({"_id": entry_id})
    data = [i for i in comp_data_entry]
    try:
        h_p_data = data[0]["paragraph_text"] + data[0]["header_text"]  # do topic extraction on paragraph and header text
        print("lda model started " + str(data[0]['_id']), data[0]['link'])
        print('Grabbing paragraph and header text from database...')
        # print(h_p_data)
        data_words = list(sent_to_words(h_p_data))
        # print("data_words",data_words)
        print('remove_punctuations...')
        # Remove Stop Words
        data_words_nostops = remove_stopwords(data_words)
        # Do lemmatization keeping only noun, adj, vb, adv
        data_lemmatized = lemmatization(data_words_nostops, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
        # print('data_lemmatized...')
        # Create Dictionary
        id2word = corpora.Dictionary(data_lemmatized)
        # Create Corpus
        texts = data_lemmatized
        # Term Document Frequency
        corpus = [id2word.doc2bow(text) for text in texts]
        # View
        print('corpus is created')  # (word, frequency of occurring)
        lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                    id2word=id2word,
                                                    num_topics=number_of_topics,
                                                    passes=5,
                                                    alpha='auto')
        print('extracting topics...')
        # topics = lda_model.print_topics()
        words_list = {
            'Topic_' + str(i): [word for word, prob in lda_model.show_topic(i, topn=10)]
            for i in range(0, lda_model.num_topics)
        }
        mycol.update_one({'_id': entry_id}, {'$set': {'lda_results': words_list}})
        print("Successfully extended the data entry with lda results", entry_id)
    except Exception:  # handling exceptions if corpus is empty
        print("corpus is empty or not valid")
        mycol.update_one({'_id': entry_id}, {'$set': {'lda_results': []}})
def run_textrank_model(entry_id, phrase_limit):
    # this will grab the paragraph and header text for the given entry and extract key phrases from it
    mycol = refer_collection()
    comp_data_entry = mycol.find({"_id": entry_id})
    data = [i for i in comp_data_entry]
    print("textrank model started", str(data[0]['_id']), data[0]['link'])
    try:
        h_p_data = data[0]["paragraph_text"] + data[0]["header_text"]  # do extraction on paragraph and header text
        combined_text = " ".join(h_p_data)
        # load a spaCy model, depending on language, scale, etc.
        nlp = spacy.load("en_core_web_sm")
        # add PyTextRank to the spaCy pipeline
        tr = pytextrank.TextRank()
        nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)
        nlp.max_length = 150000000
        doc = nlp(combined_text)
        # examine the top-ranked phrases in the document
        tr_results = []
        tr_words = []
        for p in doc._.phrases[:phrase_limit]:
            tr_results.append([p.rank, p.count, p.text])
            tr_words.append(p.text)
            # print(p.chunks)
        # summary_res = []
        # for sent in doc._.textrank.summary(limit_sentences=summary_limit):
        #     print(sent)
        #     summary_res.append(str(sent))
        # print(summary_res)
        if (len(tr_words)):
            print(tr_words)
            mycol.update_one({'_id': entry_id}, {'$set': {'textrank_results': tr_words}})
            print("Successfully extended the data entry with textrank results", entry_id)
        else:
            mycol.update_one({'_id': entry_id}, {'$set': {'textrank_results': []}})
            print("vocabulary is empty")
    except Exception:
        mycol.update_one({'_id': entry_id}, {'$set': {'textrank_results': []}})
        print("vocabulary is empty")

# run_textrank_model("F://Armitage_project//crawl_n_depth//extracted_json_files//0_www.sureway.com.au_data.json",50,5)
def query_state_update_via_queue():
    print("Query state updating queue is live")
    query_collection = refer_query_col()
    mycol = refer_collection()
    connect_str = os.getenv('AZURE_STORAGE_CONNECTION_STRING')
    query_client = QueueClient.from_connection_string(connect_str, "query-queue")
    while (True):
        # print('q')
        time.sleep(200)
        rows = query_client.receive_messages()
        for msg in rows:
            time.sleep(10)
            row = msg.content
            print(row)
            row = ast.literal_eval(row)
            print('getting_id', row[0])
            entry_id = ObjectId(row[0])
            query_data_entry = query_collection.find({"_id": entry_id})
            data = [i for i in query_data_entry]
            # check for the completion of components
            try:
                associated_entries = data[0]['associated_entries']
                print('getting associated entries')
                completed_count = 0
                for each_entry_res in associated_entries:
                    res_entry = mycol.find({"_id": each_entry_res})
                    # print("profile",each_entry_res)
                    data_res = [i for i in res_entry]
                    if (data_res[0]['simplified_dump_state'] == 'Completed'):
                        completed_count += 1
                print('completed_count', completed_count)
                print('entry_count', data[0]['entry_count'])
                if (completed_count == data[0]['entry_count']):
                    print("All the entries are completed for the query", completed_count)
                    query_collection.update_one({'_id': entry_id}, {'$set': {'state': 'Completed'}})
                    query_client.delete_message(msg)
            except KeyError as e:
                print('Query is not yet ready', e)
            except IndexError as e:
                print('Query entry not yet available')
            except Exception as e:
                print("Exception occurred during dumping ", e)
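# Hedged producer sketch (assumption, mirroring the send pattern used for the other queues
# in this module): the consumer above expects each message to literal_eval into a list whose
# first element is the query document's ObjectId as a string.
# query_client = QueueClient.from_connection_string(connect_str, "query-queue")
# query_client.send_message([str(record_entry.inserted_id)])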
def process_data_m(id_list, mode):
    tagged_data = []
    mycol = refer_collection()
    for entry_id in id_list:
        comp_data_entry = mycol.find({"_id": entry_id})
        data_o = [i for i in comp_data_entry]
        # print(data_o)
        try:
            if (mode == 'test'):
                class_tag = data_o[0]['comp_name']
            if (mode == 'train'):
                class_tag = data_o[0]['industry']
            word_cloud_results = data_o[0]['wordcloud_results_tri']
            word_cloud_tokens = [term[0] for term in word_cloud_results]
            # print(word_cloud_tokens)
            text_rank_tokens = data_o[0]["textrank_results"]
            # print(text_rank_tokens)
            title = data_o[0]["title"].split(" ")
            meta_description = data_o[0]["description"].split(" ")
            guided_lda_res = data_o[0]["guided_lda_results"]
            guided_lda_tokens = [j for i in guided_lda_res for j in i]
            # print(guided_lda_tokens)
            lda_topics = data_o[0]["lda_results"]
            lda_tokens = []
            for eac_re in lda_topics:
                lda_tokens = lda_tokens + lda_topics[eac_re]
            # print(lda_tokens)
            kpe_tokens = data_o[0]['kpe_results']
            # print(kpe_tokens)
            # combining all features
            token_set = lda_tokens + word_cloud_tokens + text_rank_tokens + title + meta_description + guided_lda_tokens + kpe_tokens
            # name_set.append(path_f)
            tagged_data.append([data_o[0]["_id"], TaggedDocument(words=token_set, tags=[class_tag])])
        except KeyError:
            print("prediction is skipped as features are not present, entry id ", entry_id)
            # print(KeyError)
            continue
    return tagged_data
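# Hedged training sketch (assumption: the TaggedDocument list built above feeds a gensim
# Doc2Vec classifier elsewhere in the pipeline; the names below are illustrative only).
# from gensim.models.doc2vec import Doc2Vec
# train_tagged = process_data_m(train_ids, 'train')
# docs = [pair[1] for pair in train_tagged]          # drop the leading _id
# d2v = Doc2Vec(vector_size=100, min_count=2, epochs=40)
# d2v.build_vocab(docs)
# d2v.train(docs, total_examples=d2v.corpus_count, epochs=d2v.epochs)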
def run_rake_model(entry_id, rake_limit):
    from nltk.corpus import stopwords
    stop_words = stopwords.words('english')
    stop_words.extend(['from', 'subject', 're', 'edu', 'use'])
    mycol = refer_collection()
    comp_data_entry = mycol.find({"_id": entry_id})
    data = [i for i in comp_data_entry]
    print("rake model started", str(data[0]['_id']), data[0]['link'])
    try:
        h_p_data = data[0]["paragraph_text"] + data[0]["header_text"]  # do extraction on paragraph and header text
        combined_text = " ".join(h_p_data)
        r = Rake(max_length=3, min_length=1, ranking_metric=Metric.DEGREE_TO_FREQUENCY_RATIO)
        r.extract_keywords_from_text(combined_text)
        res_words = r.get_ranked_phrases()
        if (len(res_words)):
            print(res_words[:rake_limit])
            mycol.update_one({'_id': entry_id}, {'$set': {'rake_results': res_words[:rake_limit]}})
            print("Successfully extended the data entry with rake results", entry_id)
        else:
            mycol.update_one({'_id': entry_id}, {'$set': {'rake_results': []}})
            print("vocabulary is empty")
    except Exception:
        mycol.update_one({'_id': entry_id}, {'$set': {'rake_results': []}})
        print("vocabulary is empty")

# run_rake_model("F://Armitage_project/crawl_n_depth/extracted_json_files/www.axcelerate.com.au_0_data.json",50)
def grab_rich_description(id_list):
    mycol = refer_collection()
    for entry_id in id_list:
        try:
            rich_description = 'None'
            # time.sleep(120)
            print(entry_id)
            comp_data_entry = mycol.find({"_id": entry_id})
            data = [i for i in comp_data_entry]
            link = data[0]['link']
            print(link)
            browser = use_chrome()
            browser.get(link)
            time.sleep(5)
            pageSource = browser.page_source
            browser.quit()
            # browser.close()
            soup = BeautifulSoup(pageSource, 'html.parser')
            metas = soup.find_all('meta')
            # print(metas)
            meta_description = [meta.attrs['content'] for meta in metas
                                if 'name' in meta.attrs and meta.attrs['name'] == 'description']
            og_description = [meta.attrs['content'] for meta in metas
                              if 'property' in meta.attrs and meta.attrs['property'] == 'og:description']
            # twitter_description = [meta.attrs['content'] for meta in metas if 'name' in meta.attrs and meta.attrs['name'] == 'twitter:description']
            if (meta_description != og_description):
                rich_description = meta_description + og_description
            else:
                rich_description = meta_description
            rich_description = '_'.join(rich_description)
            rich_description = rich_description.replace(',', ' ')
            print('***', rich_description)
            mycol.update_one({'_id': entry_id}, {'$set': {'rich_description': rich_description}})
        except Exception:
            mycol.update_one({'_id': entry_id}, {'$set': {'rich_description': 'None'}})
def get_li_data_ano(id_list):
    mycol = refer_collection()
    for entry_id in id_list:
        # time.sleep(120)
        print(entry_id)
        li_url = get_li_url(entry_id)
        if (li_url):
            print(li_url)
            # blockPrint()
            comp_li_data = scrape_li_ano(li_url)
            # enablePrint()
            # print(comp_li_data)
            corrected_dict = comp_li_data
            # for k in corrected_dict:
            #     print("'"+str(k)+"'")
            # print(corrected_dict)
            if (len(corrected_dict.keys())):
                mycol.update_one({'_id': entry_id}, {'$set': corrected_dict})
                print("Successfully extended the data entry with linkedin profile information", entry_id)
            else:
                print("No linkedin profile found! dict is empty")
def get_cp_dnb(entry_id, mode):
    mycol = refer_collection()
    comp_data_entry = mycol.find({"_id": entry_id})
    # print(comp_data_entry)
    data = [i for i in comp_data_entry]
    # comp_name = data[0]['search_text']
    # print(data)
    try:
        if mode == 'comp':
            comp_name = data[0]['search_text']
        elif mode == 'query':
            print(data)
            comp_name = data[0]['comp_name']
    except KeyError:
        comp_name = data[0]['link'].split("/")[2]
    det = [comp_name]
    sr = getGoogleLinksForSearchText(comp_name + " dnb.com", 3, 'normal')
    filtered_dnb = []
    for p in sr:
        if 'dnb.com/business-directory/company-profiles' in p['link']:
            filtered_dnb.append([p['title'], p['link']])
    if (len(filtered_dnb)):
        print("dnb profile found, extracting contact persons..")
        print(filtered_dnb[0])
        det.append(filtered_dnb[0])
        print(filtered_dnb[0][1])
        det.append(scrape_dnb(filtered_dnb[0][1]))
        print(det)
        mycol.update_one({'_id': entry_id}, {'$set': {'dnb_cp_info': det}})
        print("Successfully extended the data entry with dnb contact person data", entry_id)
    else:
        print("No dnb profile found! Try again..")
        print(det)
        mycol.update_one({'_id': entry_id}, {'$set': {'dnb_cp_info': det}})
def fix_entry_counts():
    print("Query state updating queue is live")
    query_collection = refer_query_col()
    mycol = refer_collection()
    connect_str = os.getenv('AZURE_STORAGE_CONNECTION_STRING')
    query_client = QueueClient.from_connection_string(connect_str, "query-queue")
    rows = query_client.receive_messages()
    for msg in rows:
        # time.sleep(10)
        row = msg.content
        print(row)
        row = ast.literal_eval(row)
        print('getting_id', row[0])
        entry_id = ObjectId(row[0])
        query_data_entry = query_collection.find({"_id": entry_id})
        data = [i for i in query_data_entry]
        # check for the completion of components
        associated_entries = data[0]['associated_entries']
        print('getting associated entries')
        print(len(associated_entries))
def key_phrase_extract(entry_id, number_of_candidates):
    # this will grab the header text for the given entry and extract key phrases from it
    extractor = TopicRank()
    mycol = refer_collection()
    comp_data_entry = mycol.find({"_id": entry_id})
    data = [i for i in comp_data_entry]
    print("key_phrase extraction started", str(data[0]['_id']), data[0]['link'])
    try:
        h_data = data[0]["header_text"]  # do key phrase extraction on the header text
        data_hp = " ".join(h_data)
        # write the extracted text to a .txt file because the extractor loads its input from disk
        with open('temp_text.txt', 'w', encoding='utf-8') as f:
            f.write(data_hp)
            f.close()
        extractor.load_document(input='temp_text.txt',
                                language="en",
                                max_length=10000000,  # load text file
                                normalization='stemming')
        # get stop words list
        stoplist = stopwords.words('english')
        # select the keyphrase candidates, for TopicRank the longest sequences of
        # nouns and adjectives
        extractor.candidate_selection(pos={'NOUN', 'PROPN', 'ADJ'}, stoplist=stoplist)
        # weight the candidates using a random walk. The threshold parameter sets the
        # minimum similarity for clustering, and the method parameter defines the
        # linkage method
        extractor.candidate_weighting(threshold=0.74, method='average')
        # collect the n-highest scored candidates
        kpe_results = []
        for (keyphrase, score) in extractor.get_n_best(n=number_of_candidates, stemming=True):
            kpe_results.append([keyphrase, score])
        print("key phrase extraction completed")
        # print(kpe_results)
        kpe_words = [i[0] for i in kpe_results]
        # print(kpe_words)
        print(kpe_words)
        mycol.update_one({'_id': entry_id}, {'$set': {'kpe_results': kpe_words}})
        print("Successfully extended the data entry with kpe results", entry_id)
    except Exception:  # handling exceptions if corpus is empty
        print("Observations set is empty or not valid")
        mycol.update_one({'_id': entry_id}, {'$set': {'kpe_results': []}})
        return "Observations set is empty or not valid"
def run_sequential_crawlers_m_via_queue_chain(depth_limit, crawl_limit):  # method used to run the crawler
    """
    :param depth_limit: max depth to crawl
    :param crawl_limit: max page count to crawl
    :return:
    """
    try:
        mycol = refer_collection()
        connect_str = os.getenv('AZURE_STORAGE_CONNECTION_STRING')
        deep_crawling_client = QueueClient.from_connection_string(connect_str, "deep-crawling-queue")
        f_e_client = QueueClient.from_connection_string(connect_str, "feature-extraction-queue")
        # mycol = refer_cleaned_collection()
        while (True):
            rows = deep_crawling_client.receive_messages()
            for msg in rows:
                # time.sleep(120)
                row = msg.content
                row = ast.literal_eval(row)
                print(row[0])
                entry_id = ObjectId(row[0])
                comp_data_entry = mycol.find({"_id": entry_id})
                data = [i for i in comp_data_entry]
                print(data)
                if (data[0]['link'] == 'None'):
                    continue
                print(data[0]['link'])
                print("started", data[0]['_id'], data[0]['link'] + " scraping into n depth")
                # configuring the crawlers
                # lists for collecting crawling data
                crawled_links = []
                header_text = []
                paragraph_text = []
                telephone_numbers = []
                emails = []
                social_media_links = []
                addresses = []
                allowed_domains = data[0]['link'].split("/")[2]  # getting the allowed domain from the starting url itself
                print('allowed_dm', allowed_domains)
                custom_settings = {
                    'DEPTH_LIMIT': str(depth_limit),  # setting depth limit of crawling
                }
                crawl_limit = crawl_limit  # setting crawl limit aka number of links going to crawl
                yield runner.crawl(NCrawlerSpider,
                                   start_urls=[data[0]['link'], ],
                                   allowed_domains=[allowed_domains, ],
                                   custom_settings=custom_settings,
                                   crawled_links=crawled_links,
                                   header_text=header_text,
                                   paragraph_text=paragraph_text,
                                   telephone_numbers=telephone_numbers,
                                   addresses=addresses,
                                   social_media_links=social_media_links,
                                   emails=emails,
                                   crawl_limit=crawl_limit,
                                   entry_id=entry_id)
                print("completed crawling, removing the message from the queue")
                deep_crawling_client.delete_message(msg)
                mycol.update_one({'_id': entry_id}, {'$set': {'deep_crawling_state': 'Completed'}})
                if (len(paragraph_text) == 0):
                    get_cp_page_data([entry_id])
                print("Adding to feature extraction queue")
                f_e_client.send_message([str(entry_id)], time_to_live=-1)
                mycol.update_one({'_id': entry_id}, {'$set': {'feature_extraction_state': 'Incomplete'}})
        # print("reactor is stopping")
        # reactor.callFromThread(reactor.stop)
        # print(' reactor stops',threading.currentThread().ident)
        reactor.stop()
    except Exception as e:
        print("An error has occurred.. try again!", e)
p8 = Process(target=get_aven_data_via_queue)
p8.start()
p9 = Process(target=get_ad_from_google_via_queue)
p9.start()
p10 = Process(target=get_dnb_data_via_queue)
p10.start()
p11 = Process(target=get_tp_from_google_via_queue)
p11.start()
p12 = Process(target=get_cb_data_via_queue)
p12.start()
p13 = Process(target=simplified_export_via_queue)
p13.start()
connect_str = os.getenv('AZURE_STORAGE_CONNECTION_STRING')
ic_client = QueueClient.from_connection_string(connect_str, "initial-crawling-queue")
mycol = refer_collection()
while (True):
    rows = ic_client.receive_messages()
    for msg in rows:
        # time.sleep(120)
        row = msg.content
        row = ast.literal_eval(row)
        print(row[0])
        input_d = row[0].split("--")
        try:
            mode = input_d[1]
            s_text = input_d[0]
            if mode == 'query':
                query = s_text.strip()
                print("Searching a query")
def spider_closed(self, spider):  # once the spider is done crawling, dump the collected data
    spider.logger.info('Spider closed dumping data: %s', spider.name)
    print('spider is closed dumping data.....')
    # remove duplicates
    self.crawled_links = list(set(self.crawled_links))
    self.header_text = list(set(self.header_text))
    self.paragraph_text = list(set(self.paragraph_text))
    self.emails = list(set(self.emails))
    self.addresses = list(set(self.addresses))
    # print("old",self.addresses)
    self.addresses = getting_uniques(self.addresses)
    self.social_media_links = list(set(self.social_media_links))
    self.telephone_numbers = list(set(self.telephone_numbers))
    # print(self.social_media_links)
    fixed_addresses_with_sources = []
    for ad in self.addresses:
        for ad_s in self.addresses_with_links:
            if (ad in ad_s[0]):
                # print(ad, ad_s[1])
                fixed_addresses_with_sources.append([ad, ad_s[1]])
                break
    fixed_tp_with_sources = []
    for tp in self.telephone_numbers:
        for tp_s in self.telephone_numbers_with_links:
            if (tp in tp_s[0]):
                # print(tp, tp_s[1])
                fixed_tp_with_sources.append([tp, tp_s[1]])
                break
    fixed_sm_with_sources = []
    for sm in self.social_media_links:
        for sm_s in self.social_media_links_with_links:
            if (sm in sm_s[0]):
                # print(sm, sm_s[1])
                fixed_sm_with_sources.append([sm, sm_s[1]])
                break
    fixed_em_with_sources = []
    for em in self.emails:
        for em_s in self.emails_with_links:
            if (em in em_s[0]):
                # print(em, em_s[1])
                fixed_em_with_sources.append([em, em_s[1]])
                break
    fixed_ht_with_sources = []
    for ht in self.header_text:
        for ht_s in self.header_text_with_links:
            if (ht in ht_s[0]):
                # print(ht, ht_s[1])
                fixed_ht_with_sources.append([ht, ht_s[1]])
                break
    fixed_pt_with_sources = []
    for pt in self.paragraph_text:
        for pt_s in self.paragraph_text_with_links:
            if (pt in pt_s[0]):
                # print(pt, pt_s[1])
                fixed_pt_with_sources.append([pt, pt_s[1]])
                break
    print("with_links", self.addresses_with_links)
    print("fixed", fixed_addresses_with_sources)
    n_depth_data = {
        'crawled_links': self.crawled_links,
        'header_text': self.header_text,
        'paragraph_text': self.paragraph_text,
        'emails': self.emails,
        'addresses': self.addresses,
        'social_media_links': self.social_media_links,
        'telephone_numbers': self.telephone_numbers,
        'website_addresses_with_sources': fixed_addresses_with_sources,
        'header_text_with_sources': fixed_ht_with_sources,
        'paragraph_text_with_sources': fixed_pt_with_sources,
        'emails_with_sources': fixed_em_with_sources,
        'social_media_links_with_sources': fixed_sm_with_sources,
        'telephone_numbers_with_sources': fixed_tp_with_sources,
    }
    # print("size", len(self.paragraph_text))
    # print("address",self.addresses)
    # print("addresses_with_links",fixed_addresses_with_sources)
    # print("telephone", self.telephone_numbers)
    try:
        mycol = refer_collection()
        mycol.update_one({'_id': self.entry_id}, {'$set': n_depth_data})
        print("Successfully extended the data entry", self.entry_id)
    except Exception:
        print("Max document size reached..data is truncated!")
        n_depth_data['paragraph_text'] = self.paragraph_text[:5000]
        mycol = refer_collection()
        mycol.update_one({'_id': self.entry_id}, {'$set': n_depth_data})
        print("Successfully extended the data entry", self.entry_id)
def run_wordcloud_model(entry_id, mode):
    # this will grab the paragraph and header text for the given entry and build a word cloud from it
    mycol = refer_collection()
    comp_data_entry = mycol.find({"_id": entry_id})
    data = [i for i in comp_data_entry]
    print("wordcloud model started", str(data[0]['_id']), data[0]['link'])
    try:
        h_p_data = data[0]["paragraph_text"] + data[0]["header_text"]  # do extraction on paragraph and header text
        wordcloud = WordCloud(background_color="white",
                              max_words=100,
                              contour_width=3,
                              contour_color='steelblue')
        # Generate a word cloud
        data_words = list(sent_to_words(h_p_data))
        # data_words_nostops = remove_stopwords(data_words)
        data_lemmatized = lemmatization(data_words, allowed_postags=['NOUN', 'ADJ'])
        # data_lemmatized = lemmatization(data_words_nostops, allowed_postags=['NOUN', 'ADJ'])
        # print(data_lemmatized)
        all_tokens = [j for i in data_lemmatized for j in i]
        # print('all', all_tokens)
        all_tokens = [value for value in all_tokens
                      if (value != 'other' and value != 'day' and value != 'thing' and value != 'last')]
        if mode == 'single':
            combined_text = " ".join(all_tokens)
        else:
            if mode == 'bi':
                # setup and score the bigrams using the raw frequency.
                finder = BigramCollocationFinder.from_words(all_tokens)
                bigram_measures = BigramAssocMeasures()
                scored = finder.score_ngrams(bigram_measures.raw_freq)
            if mode == 'tri':
                # setup and score the trigrams using the raw frequency.
                finder = TrigramCollocationFinder.from_words(all_tokens)
                trigram_measures = TrigramAssocMeasures()
                scored = finder.score_ngrams(trigram_measures.raw_freq)
            # print(scored)
            # By default finder.score_ngrams is sorted, however don't rely on this default behavior.
            # Sort highest to lowest based on the score.
            scoredList = sorted(scored, key=itemgetter(1), reverse=True)
            # print('sclist',scoredList)
            # word_dict is the dictionary we'll use for the word cloud.
            # Load dictionary with the FOR loop below.
            # The dictionary will look like this with the n-gram and the score from above:
            # word_dict = {'bigram A': 0.000697411,
            #              'bigram B': 0.000524882}
            word_dict = {}
            listLen = len(scoredList)
            # Get the n-gram and make a contiguous string for the dictionary key.
            # Set the key to the scored value.
            for i in range(listLen - 1):
                word_dict['_'.join(scoredList[i][0])] = scoredList[i][1]
            # print('dic',word_dict)
        if mode == 'single':
            wordcloud.generate(combined_text)
        else:
            wordcloud.generate_from_frequencies(word_dict)
    except Exception:
        print("cannot make word cloud for empty text")
        mycol.update_one({'_id': entry_id}, {'$set': {'wordcloud_results_' + mode: []}})
        print("vocabulary is empty")
        return "Vocabulary is empty"
    # Visualize the word cloud
    wordcloud.to_image()
    wordcloud_words = []
    word_cloud_results = []
    for each_res in wordcloud.words_:
        word_cloud_results.append([each_res, wordcloud.words_[each_res]])
        wordcloud_words.append(each_res)
    # print('words', wordcloud_words)
    # plt.imshow(wordcloud, interpolation='bilinear')
    # plt.axis("off")
    # plt.savefig(name_j[:-4]+"png")
    # plt.show()
    print(word_cloud_results)
    # return wordcloud_words
    mycol.update_one({'_id': entry_id}, {'$set': {'wordcloud_results_' + mode: word_cloud_results}})
    print("Successfully extended the data entry with wordcloud results", entry_id)

# run_wordcloud_model("F://Armitage_project//crawl_n_depth//extracted_json_files//3_www.hydroterra.com.au_data.json")
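# Hedged usage sketch with the current signature (the commented call above appears to predate
# the switch from json files to database entries; the entry id below is a placeholder):
# run_wordcloud_model(ObjectId('<profile_id>'), mode='tri')   # mode is 'single', 'bi', or 'tri'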
def get_missed_to_csv(prof_list):
    with open('EDU_deep_keywords.csv', mode='w', encoding='utf8', newline='') as results_file:
        results_writer = csv.writer(results_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        results_writer.writerow([
            'link', 'title', 'description', 'rake_results', 'textrank_results',
            'kpe_results', 'wordcloud_results', 'IsPrivateEquity'
        ])
        for k in prof_list:
            search_t = k + ' Australia'
            mycol = refer_collection()
            entry = mycol.find({"search_text": search_t})
            data = [d for d in entry]
            iseq = False
            eq_t = ['Investor', 'Invested', 'Private Equity', 'Acquired', 'Allocated capital']
            # print(data)
            try:
                all_text = (',').join(data[0]['paragraph_text']) + (',').join(data[0]['header_text'])
                for term in eq_t:
                    if term in all_text:
                        iseq = True
                        break
                row = [
                    data[0]['link'], data[0]['title'], data[0]['description'],
                    data[0]['rake_results'][:10], data[0]['textrank_results'][:10],
                    data[0]['kpe_results'][:10],
                    [w[0] for w in data[0]['wordcloud_results_tri'][:10]], iseq
                ]
            except Exception as e:
                print(k)
                p = k.split('//')[1].replace('www.', '')
                c_name = p.split('.com')[0]
                entry = mycol.find({"comp_name": c_name})
                data = [d for d in entry]
                try:
                    all_text = (',').join(data[0]['paragraph_text']) + (',').join(data[0]['header_text'])
                    for term in eq_t:
                        if term in all_text:
                            iseq = True
                            break
                    row = [
                        data[0]['link'], data[0]['title'], data[0]['description'],
                        data[0]['rake_results'][:10], data[0]['textrank_results'][:10],
                        data[0]['kpe_results'][:10],
                        [w[0] for w in data[0]['wordcloud_results_tri'][:10]], iseq
                    ]
                except Exception as e:
                    print(k, "******")
                    continue
            # q = k + ' --comp'
            # add_to_initial_crawling_queue([q])
            results_writer.writerow(row)
    results_file.close()

# get_missed_to_csv(missed_list)
def run_guided_lda_model(entry_id, number_of_topics):
    # this will grab the paragraph and header text for the given entry and extract topics from it
    mycol = refer_collection()
    comp_data_entry = mycol.find({"_id": entry_id})
    data = [i for i in comp_data_entry]
    print("guided lda model started", str(data[0]['_id']), data[0]['link'])
    try:
        h_p_data = data[0]["paragraph_text"] + data[0]["header_text"]  # do topic extraction on paragraph and header text
        combined_text = " ".join(h_p_data)
        doc_list = [combined_text]
        token_vectorizer = CountVectorizer(doc_list, stop_words=stop_words)
        X = token_vectorizer.fit_transform(doc_list)
        tf_feature_names = token_vectorizer.get_feature_names()
        word2id = dict((v, idx) for idx, v in enumerate(tf_feature_names))
        seed_topic_list = [
            ['about', 'vision', 'mission', 'approach', 'team', 'clients'],
            ['course', 'service', 'work', 'task'],
            ['address', 'australia', 'contact', 'email', 'location', 'call', 'social']
        ]
        number_of_topics = number_of_topics
        model = guidedlda.GuidedLDA(n_topics=number_of_topics, n_iter=100, random_state=7, refresh=20)
        seed_topics = {}
        for t_id, st in enumerate(seed_topic_list):
            for word in st:
                if (word in word2id.keys()):
                    seed_topics[word2id[word]] = t_id
                else:
                    try:
                        word2id[word] = str(int(list(word2id.keys())[-1]) + 1)
                        seed_topics[word2id[word]] = t_id
                    except ValueError:
                        pass
        model.fit(X, seed_topics=seed_topics, seed_confidence=0.35)
        n_top_words = 10
        topic_word = model.topic_word_
        topics_set = []
        for i, topic_dist in enumerate(topic_word):
            topic_words = np.array(tf_feature_names)[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
            words = [w for w in topic_words]
            topics_set.append(words)
            # print('Topic {}: {}'.format(i, ' '.join(topic_words)))
        print('extracting topics...')
        print(topics_set)
        mycol.update_one({'_id': entry_id}, {'$set': {'guided_lda_results': topics_set}})
        print("Successfully extended the data entry with guided lda results", entry_id)
    except Exception:
        print("Vocabulary is empty")
        mycol.update_one({'_id': entry_id}, {'$set': {'guided_lda_results': []}})
        return "Not enough data / Vocabulary is empty"
def get_entries_project(project_id):
    completed_count = []
    incomplete_count = 0
    incompletes = []
    problems = []
    all_entries = []
    profile_col = refer_collection()
    projects_col = refer_projects_col()
    query_collection = refer_query_col()
    proj_data_entry = projects_col.find({"_id": project_id})
    print('proj', proj_data_entry)
    proj_data = [i for i in proj_data_entry]
    print('data', len(proj_data))
    proj_attribute_keys = list(proj_data[-1].keys())
    if ('associated_queries' in proj_attribute_keys):
        associated_queries = proj_data[-1]['associated_queries']
        for each_query in associated_queries:
            query_data_entry = query_collection.find({"_id": ObjectId(each_query)})
            query_data = [i for i in query_data_entry]
            print([query_data[0]['search_query'], query_data[0]['state'], query_data[0]['_id']])
            query_attribute_keys = list(query_data[0].keys())
            if ('associated_entries' in query_attribute_keys):
                associated_entries = query_data[0]['associated_entries']
                # print('kk',associated_entries)
                obs_ids = [ObjectId(i) for i in associated_entries]
                all_entries.extend(obs_ids)
                for k in obs_ids:
                    prof_data_entry = profile_col.find({"_id": k})
                    # print('proj', proj_data_entry)
                    prof_data = [i for i in prof_data_entry]
                    prof_attribute_keys = list(prof_data[0].keys())
                    if ('simplified_dump_state' in prof_attribute_keys):
                        if (prof_data[0]['simplified_dump_state'] == 'Completed'):
                            completed_count.append(k)
                        # else: print(prof_data[0]['simplified_dump_state'])
                        elif (prof_data[0]['simplified_dump_state'] == 'Incomplete'):
                            incomplete_count += 1
                            incompletes.append(k)
                        else:
                            problems.append(k)
                    else:
                        problems.append(k)
                # print(['completed',completed_count,'all',len(obs_ids),'incompleted',incomplete_count,incompletes,'prob',problems])
                # filt = []
                # for k in obs_ids:
                #     if(k not in problems):
                #         filt.append(k)
                # print('filt',filt)
                # if(completed_count==len(obs_ids)):
                #     query_collection.update_one({'_id': ObjectId(each_query)}, {'$set': {'state': 'Completed'}})
                # return obs_ids
        print('completed_count', len(list(set(completed_count))))
        print('incomplete_count', incomplete_count)
        print('incompletes', list(set(incompletes)))
        print('problems', list(set(problems)))
        print('all', all_entries)
        return {
            'incompletes': list(set(incompletes)),
            'problems': list(set(problems))
        }
        # all_entries = list(set(all_entries))
        # return all_entries
    else:
        print("This project does not have any queries yet")
        return []
def repair_wanted_parts(entry_id_list):
    profile_col = refer_collection()
    for k in entry_id_list:
        print('*****************')
        prof_data_entry = profile_col.find({"_id": k})
        # print('proj', proj_data_entry)
        prof_data = [i for i in prof_data_entry]
        prof_attribute_keys = list(prof_data[0].keys())
        if ('deep_crawling_state' in prof_attribute_keys):
            print('yes')
            if (prof_data[0]['deep_crawling_state'] == 'Completed'):
                print('deep_crawling_state_already_done')
            else:
                add_to_deep_crawling_queue([k])
        else:
            add_to_deep_crawling_queue([k])
        if ('feature_extraction_state' in prof_attribute_keys):
            if (prof_data[0]['feature_extraction_state'] == 'Completed'):
                print('feature_extraction_state_already_done')
            else:
                add_to_deep_crawling_queue([k])
        else:
            add_to_deep_crawling_queue([k])
        #
        if ('classification_state' in prof_attribute_keys):
            if (prof_data[0]['classification_state'] == 'Completed'):
                print('classification_state_already_done')
            else:
                add_to_deep_crawling_queue([k])
        else:
            add_to_deep_crawling_queue([k])
        if ('owler_qa_state' in prof_attribute_keys):
            if (prof_data[0]['owler_qa_state'] == 'Completed'):
                print('owler_qa_state_already_done')
            else:
                print("Adding to Owler QA extraction queue")
                add_to_qa_queue([k])
        else:
            print("Adding to Owler QA extraction queue")
            add_to_qa_queue([k])
        if ('google_cp_state' in prof_attribute_keys):
            if (prof_data[0]['google_cp_state'] == 'Completed'):
                print('google_cp_state_already_done')
            else:
                print("Adding to google contact person extraction queue")
                add_to_cp_queue([k])
        else:
            print("Adding to google contact person extraction queue")
            add_to_cp_queue([k])
        if ('oc_extraction_state' in prof_attribute_keys):
            if (prof_data[0]['oc_extraction_state'] == 'Completed'):
                print('oc_extraction_state_already_done')
            else:
                print("Adding to Opencorporates extraction queue")
                add_to_oc_queue([k])
        else:
            print("Adding to Opencorporates extraction queue")
            add_to_oc_queue([k])
        if ('google_address_state' in prof_attribute_keys):
            if (prof_data[0]['google_address_state'] == 'Completed'):
                print('google_address_state_already_done')
            else:
                print("Adding to google address extraction queue")
                add_to_ad_queue([k])
        else:
            print("Adding to google address extraction queue")
            add_to_ad_queue([k])
        if ('dnb_extraction_state' in prof_attribute_keys):
            if (prof_data[0]['dnb_extraction_state'] == 'Completed'):
                print('dnb_extraction_state_already_done')
            else:
                print("Adding to DNB extraction queue")
                add_to_dnb_queue([k])
        else:
            print("Adding to DNB extraction queue")
            add_to_dnb_queue([k])
        #
        if ('google_tp_state' in prof_attribute_keys):
            if (prof_data[0]['google_tp_state'] == 'Completed'):
                print('google_tp_state_already_done')
            else:
                print("Adding to google tp extraction queue")
                add_to_tp_queue([k])
        else:
            print("Adding to google tp extraction queue")
            add_to_tp_queue([k])
        #
        if ('crunchbase_extraction_state' in prof_attribute_keys):
            if (prof_data[0]['crunchbase_extraction_state'] == 'Completed'):
                print('crunchbase_extraction_state_already_done')
            else:
                print("Adding to Crunchbase extraction queue")
                add_to_cb_queue([k])
        else:
            print("Adding to Crunchbase extraction queue")
            add_to_cb_queue([k])
        #
        if ('li_cp_state' in prof_attribute_keys):
            if (prof_data[0]['li_cp_state'] == 'Completed'):
                print('li_cp_state_already_done')
            else:
                print("Adding to linkedin cp extraction queue")
                add_to_li_cp_queue([k])
        else:
            print("Adding to linkedin cp extraction queue")
            add_to_li_cp_queue([k])
        #
        if ('simplified_dump_state' in prof_attribute_keys):
            if (prof_data[0]['simplified_dump_state'] == 'Completed'):
                print('simplified_dump_state_already_done')
            else:
                print("Adding to simplified dump queue")
                add_to_simplified_export_queue([k])
        else:
            print("Adding to simplified dump queue")
            add_to_simplified_export_queue([k])
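# Hedged repair-run sketch (the project id below is a placeholder): audit a project with
# get_entries_project, then requeue only the entries flagged as incomplete or problematic.
# audit = get_entries_project(ObjectId('<project_id>'))
# repair_wanted_parts(audit['incompletes'] + audit['problems'])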
def execute_for_a_company(comp_name):
    mycol = refer_collection()
    print("Searching a company")
    dateTimeObj = datetime.now()
    query_collection = refer_query_col()
    data_q = {'started_time_stamp': dateTimeObj, 'search_query': comp_name}
    record_entry = query_collection.insert_one(data_q)
    print("Started on", dateTimeObj)
    started = time.time()
    print("***Initial Crawling Phase***")
    entry_id = search_a_company(comp_name, mycol, record_entry.inserted_id)
    if (entry_id == None):
        for i in range(3):
            print("Initial crawling incomplete..retrying", i)
            entry_id = search_a_company(comp_name, mycol, record_entry.inserted_id)
            time.sleep(5)
            if (entry_id != None):
                break
    if (entry_id == None):
        print("Initial crawling incomplete..retrying unsuccessful")
    elif (entry_id == 'exist'):
        print("Existing profile found. pipeline exits")
    else:
        print("entry id received ", entry_id)
        print("***Deep Crawling Phase***")
        deep_crawl([entry_id], 3, 100)
        print("Deep crawling completed and record extended with crawled_links, header_text, paragraph_text, social_media_links, telephone numbers, emails, addresses")
        print("***Feature Extraction Phase***")
        extract_features([entry_id])
        print("***Contact Person Extraction Phase***")
        extract_contact_persons([entry_id], 'comp')
        print("***Predict the company type***")
        predict_class_tags([entry_id])
        print("***Extract crunchbase profile data***")
        get_cb_data([entry_id])
        print("***Extract linkedin profile data***")
        get_li_data([entry_id])
        print("***Extract opencorporates profile data***")
        get_oc_data([entry_id])
        print("***Extract dnb profile data***")
        get_dnb_data([entry_id])
        print("***Addresses from google***")
        get_ad_from_google([entry_id])
        print("***Addresses extraction completed***")
        print("***cp from google***")
        get_cp_from_google([entry_id])
        print("***cp extraction completed***")
        print("***phone numbers from google***")
        get_tp_from_google([entry_id])
        print("***tp extraction completed***")
        print("***frequent questions from google***")
        get_qa_from_google([entry_id])
        print("***qa extraction completed***")
        print("***Dumping the results***")
        # export_profiles([entry_id],record_entry.inserted_id)
        simplified_export([entry_id])
        ended = time.time()
        duration = ended - started
        dateTimeObj_e = datetime.now()
        completion_data = {'completed_time_stamp': dateTimeObj_e, 'elapsed_time': duration}
        print(completion_data)
        query_collection.update_one({'_id': record_entry.inserted_id}, {'$set': completion_data})
        print("Pipeline execution completed, elapsed time:", duration)
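# Hedged entry-point sketch, following the commented usage style used elsewhere in this repo
# (the company name is illustrative only):
# execute_for_a_company("Hydroterra Australia")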