Example #1
def add_to_initial_crawling_queue(name_list):
    mycol = refer_collection()
    connect_str = os.getenv('AZURE_STORAGE_CONNECTION_STRING')
    ic_client = QueueClient.from_connection_string(connect_str, "initial-crawling-queue")
    for name in name_list:
        print(name)
        ic_client.send_message([str(name)])
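
The queue helpers in these examples serialize each payload as the string form of a Python list, and the consumers recover it with ast.literal_eval before indexing into it. A minimal, queue-free sketch of that round trip (the ObjectId string is made up for illustration):

import ast

payload = str(["5f2b9c1e8e4b0a3d2c1f0a9b"])  # what the producer hands to send_message
row = ast.literal_eval(payload)              # consumer side: parse back into a list
entry_id_str = row[0]                        # -> "5f2b9c1e8e4b0a3d2c1f0a9b"
print(entry_id_str)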
Example #2
def run_sequential_crawlers_m(id_list, depth_limit,
                              crawl_limit):  #method used to run the crawler
    """

    :param id_list: list of database entry ids to crawl further
    :param depth_limit: maximum depth to crawl
    :param crawl_limit: maximum number of pages to crawl
    :return:
    """
    mycol = refer_collection()
    # mycol = refer_cleaned_collection()
    for entry_id in id_list:  # crawl to n depth for each google search result
        comp_data_entry = mycol.find({"_id": entry_id})
        data = [i for i in comp_data_entry]
        print(data)
        if (data[0]['link'] == 'None'): continue
        print(data[0]['link'])
        print("started", data[0]['_id'],
              data[0]['link'] + " scraping in to n depth")

        #configuring the crawlers
        # lists for collecting crawling data
        crawled_links = []
        header_text = []
        paragraph_text = []
        telephone_numbers = []
        emails = []
        social_media_links = []
        addresses = []
        allowed_domains = data[0]['link'].split("/")[
            2]  # derive the allowed domain from the start URL itself
        print('allowed_dm', allowed_domains)

        custom_settings = {
            'DEPTH_LIMIT':  #setting depth limit of crawling
            str(depth_limit),
        }
        # crawl_limit caps the number of pages the spider will visit
        yield runner.crawl(NCrawlerSpider,
                           start_urls=[
                               data[0]['link'],
                           ],
                           allowed_domains=[
                               allowed_domains,
                           ],
                           custom_settings=custom_settings,
                           crawled_links=crawled_links,
                           header_text=header_text,
                           paragraph_text=paragraph_text,
                           telephone_numbers=telephone_numbers,
                           addresses=addresses,
                           social_media_links=social_media_links,
                           emails=emails,
                           crawl_limit=crawl_limit,
                           entry_id=entry_id)
        print("one done")
    # print("reactor is stopping")
    # reactor.callFromThread(reactor.stop)
    # print(' reactor stops',threading.currentThread().ident)
    reactor.stop()
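
run_sequential_crawlers_m yields one crawl Deferred per entry, which is the usual Scrapy/Twisted pattern for running spiders one after another; the surrounding module presumably creates a CrawlerRunner named runner. A hedged sketch of how such a generator is typically driven (the wrapper name deep_crawl matches the call in Example #25, but the project's real driver may differ):

from scrapy.crawler import CrawlerRunner
from twisted.internet import defer, reactor

runner = CrawlerRunner()  # assumed to exist at module level in the original code

def deep_crawl(id_list, depth_limit, crawl_limit):
    # inlineCallbacks waits on each yielded crawl Deferred before resuming the
    # generator, so the spiders run sequentially; the generator stops the reactor itself.
    defer.inlineCallbacks(run_sequential_crawlers_m)(id_list, depth_limit, crawl_limit)
    reactor.run()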
Example #3
def get_cp_oc(entry_id,mode):
    # myclient = pymongo.MongoClient("mongodb://localhost:27017/")
    # mydb = myclient["CompanyDatabase"]  # refer the database
    # mycol = mydb["comp_data"]  # refer the collection
    mycol = refer_collection()
    comp_data_entry = mycol.find({"_id": entry_id})
    data = [i for i in comp_data_entry]
    # comp_name = data[0]['search_text']
    try:
        if mode=='comp':
            comp_name = data[0]['search_text']
        elif mode == 'query':
            comp_name = data[0]['comp_name']
    except KeyError:
        comp_name = data[0]['link'].split("/")[2]

    det=[comp_name]
    sr = getGoogleLinksForSearchText(comp_name + " opencorporates", 3, 'normal')

    filtered_oc = []
    for p in sr:
        if (('opencorporates.com/companies/nz' in p['link']) or ('opencorporates.com/companies/au' in p['link'])):
            filtered_oc.append([p['title'], p['link']])
    if (len(filtered_oc)):
        print(filtered_oc[0])
        det.append(filtered_oc[0])
        det.append(scrape_opencorporates(filtered_oc[0][1]))
        print(det)
        mycol.update_one({'_id': entry_id},
                         {'$set': {'oc_cp_info': det}})
        print("Successfully extended the data entry with opencorporates contact person data", entry_id)
    else:
        print("No opencorporates profile found!, Try again")
        mycol.update_one({'_id': entry_id},
                         {'$set': {'oc_cp_info': det}})
Example #4
def get_li_url(entry_id):

    mycol = refer_collection()
    comp_data_entry = mycol.find({"_id": entry_id})
    data = [i for i in comp_data_entry]
    try:
        sm_links = data[0]['social_media_links']
    except Exception:
        sm_links = []
    linked_in_comp_urls = []
    for each in sm_links:
        if('linkedin.com/company' in each):linked_in_comp_urls.append(each)
    if(len(linked_in_comp_urls)):
        print("Linkedin profile collected from crawled data")
        print("linkedin taken from crawling")
        return linked_in_comp_urls[0]
    else:
        comp_name = data[0]['comp_name']
        print(data[0]['comp_name'])
        sr = getGoogleLinksForSearchText( comp_name + " linkedin australia", 5, 'normal')
        if (len(sr) == 0):
            sr = getGoogleLinksForSearchText(comp_name + " linkedin australia", 5, 'normal')
            if (len(sr) == 0):
                sr = getGoogleLinksForSearchText(comp_name + " linkedin australia", 5, 'normal')

        filtered_li = []
        for p in sr:
            # print(p['link'])
            if 'linkedin.com/company' in p['link']:
                filtered_li.append(p['link'])
        if (len(filtered_li)):
            return filtered_li[0]
        else:
            print("No linkedin contacts found!, Try again")
            return False
Example #5
def get_li_data(id_list):
    mycol = refer_collection()
    for entry_id in id_list:
        time.sleep(60)
        li_url = get_li_url(entry_id)
        if (li_url):
            print(li_url)
            # blockPrint()
            comp_li_data = scrape_company(li_url)
            # enablePrint()
            # print(comp_li_data)
            corrected_dict = {k + '_li': v for k, v in comp_li_data.items()}
            # corrected_dict['li_url']=li_url
            if ('headquarters_li' in corrected_dict.keys()):
                # print(corrected_dict['headquarters_li'])
                if (not isvalid_hq(corrected_dict['headquarters_li'])):
                    corrected_dict = {}

            # for k in corrected_dict:
            #     print("'"+str(k)+"'")
            print(corrected_dict)
            if (len(corrected_dict.keys())):
                corrected_dict['li_url'] = li_url
                mycol.update_one({'_id': entry_id}, {'$set': corrected_dict})
                print(
                    "Successfully extended the data entry with linkedin profile information",
                    entry_id)
            else:
                print("No correct linkedin profile found! dict is empty")
Example #6
def run_mallet_model(
    entry_id, number_of_topics
):  #this will extract paragraph and header text from given json file and extract the topics from that

    mycol = refer_collection()
    comp_data_entry = mycol.find({"_id": entry_id})
    data = [i for i in comp_data_entry]
    print("lda mallet model started", str(data[0]['_id']), data[0]['link'])
    print('Grabbing paragraph and header text from database...')
    try:
        h_p_data = data[0]["paragraph_text"] + data[0][
            "header_text"]  # do topic extraction on paragraph and header text
        # print(h_p_data)
        data_words = list(sent_to_words(h_p_data))
        # print("data_words",data_words)
        print('remove_punctuations...')
        # Remove Stop Words
        data_words_nostops = remove_stopwords(data_words)

        # Do lemmatization keeping only noun, adj, vb, adv
        data_lemmatized = lemmatization(
            data_words_nostops, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
        # print('data_lemmatized...')
        # Create Dictionary
        id2word = corpora.Dictionary(data_lemmatized)
        # print('id2word',id2word)
        # Create Corpus
        texts = data_lemmatized
        # Term Document Frequency
        corpus = [id2word.doc2bow(text) for text in texts]
        # print('corpus',corpus)
        # View
        print('corpus is created')  # (word, frequency of occurrence)
        topics = []
        mallet_list = []
        mallet_path = 'F:/Armitage_project/crawl_n_depth/utilities/new_mallet/mallet-2.0.8/bin/mallet'  # update this path
        ldamallet = gensim.models.wrappers.LdaMallet(
            mallet_path,
            corpus=corpus,
            num_topics=number_of_topics,
            id2word=id2word)
        print('extracting topics...')
        mallet_list = {
            'Topic_' + str(i):
            [word for word, prob in ldamallet.show_topic(i, topn=10)]
            for i in range(0, ldamallet.num_topics)
        }

        mycol.update_one({'_id': entry_id},
                         {'$set': {
                             'mallet_results': mallet_list
                         }})
        print(mallet_list)
        print("Successfully extended the data entry with mallet results",
              entry_id)

    except Exception:  #handling exceptions if corpus is empty
        print("corpus is empty or not valid/ mallet model cannot continue")
        mycol.update_one({'_id': entry_id}, {'$set': {'mallet_results': []}})
Example #7
def run_lda_model(
    entry_id, number_of_topics
):  #this will extract paragraph and header text from given json file and extract the topics from that

    mycol = refer_collection()
    comp_data_entry = mycol.find({"_id": entry_id})
    data = [i for i in comp_data_entry]
    try:
        h_p_data = data[0]["paragraph_text"] + data[0][
            "header_text"]  #do topic extraction on paragraph and header text
        print("lda model started " + str(data[0]['_id']), data[0]['link'])
        print('Grabbing paragraph and header text from database...')
        # print(h_p_data)
        data_words = list(sent_to_words(h_p_data))
        # print("data_words",data_words)
        print('remove_punctuations...')
        # Remove Stop Words
        data_words_nostops = remove_stopwords(data_words)

        # Do lemmatization keeping only noun, adj, vb, adv
        data_lemmatized = lemmatization(
            data_words_nostops, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
        # print('data_lemmatized...')

        # Create Dictionary
        id2word = corpora.Dictionary(data_lemmatized)
        # Create Corpus
        texts = data_lemmatized
        # Term Document Frequency
        corpus = [id2word.doc2bow(text) for text in texts]
        # View
        print('corpus is created')  # (word, frequency of occurrence)

        lda_model = gensim.models.ldamodel.LdaModel(
            corpus=corpus,
            id2word=id2word,
            num_topics=number_of_topics,
            passes=5,
            alpha='auto')
        print('extracting topics...')
        # topics = lda_model.print_topics()
        words_list = {
            'Topic_' + str(i):
            [word for word, prob in lda_model.show_topic(i, topn=10)]
            for i in range(0, lda_model.num_topics)
        }
        mycol.update_one({'_id': entry_id},
                         {'$set': {
                             'lda_results': words_list
                         }})
        print("Successfully extended the data entry with lda results",
              entry_id)

    except Exception:  #handling exceptions if corpus is empty
        print("corpus is empty or not valid")
        mycol.update_one({'_id': entry_id}, {'$set': {'lda_results': []}})
Example #8
def run_textrank_model(
    entry_id, phrase_limit
):  # this will extract paragraph and header text from given json file and extract the topics from that
    mycol = refer_collection()
    comp_data_entry = mycol.find({"_id": entry_id})
    data = [i for i in comp_data_entry]
    print("textrank model started", str(data[0]['_id']), data[0]['link'])
    try:
        h_p_data = data[0]["paragraph_text"] + data[0][
            "header_text"]  # do topic extraction on paragraph and header text

        combined_text = " ".join(h_p_data)

        # load a spaCy model, depending on language, scale, etc.
        nlp = spacy.load("en_core_web_sm")

        # add PyTextRank to the spaCy pipeline
        tr = pytextrank.TextRank()
        nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)
        nlp.max_length = 150000000
        doc = nlp(combined_text)

        # examine the top-ranked phrases in the document
        tr_results = []
        tr_words = []
        for p in doc._.phrases[:phrase_limit]:
            tr_results.append([p.rank, p.count, p.text])
            tr_words.append(p.text)
            # print(p.chunks)
        # summery_res = []
        # for sent in doc._.textrank.summary(limit_sentences=summery_limit):
        #     print(sent)
        #     summery_res.append(str(sent))
        # print(summery_res)
        if (len(tr_words)):
            print(tr_words)
            mycol.update_one({'_id': entry_id},
                             {'$set': {
                                 'textrank_results': tr_words
                             }})
            print("Successfully extended the data entry with textrank results",
                  entry_id)
        else:
            mycol.update_one({'_id': entry_id},
                             {'$set': {
                                 'textrank_results': []
                             }})
            print("vocabulary is empty")
    except Exception:
        mycol.update_one({'_id': entry_id}, {'$set': {'textrank_results': []}})
        print("vocabulary is empty")


# run_textrank_model("F://Armitage_project//crawl_n_depth//extracted_json_files//0_www.sureway.com.au_data.json",50,5)
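
The pipeline registration above uses the pytextrank 2.x API (TextRank() plus PipelineComponent). If the project were moved to spaCy 3.x with pytextrank 3.x, the equivalent setup would look roughly like this sketch:

import spacy
import pytextrank  # noqa: F401 -- importing registers the "textrank" pipeline factory

nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("textrank", last=True)

doc = nlp("Some combined header and paragraph text.")
for phrase in doc._.phrases[:10]:
    print(phrase.rank, phrase.count, phrase.text)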
Example #9
def query_state_update_via_queue():
    print("Query state updating queue is live")
    query_collection = refer_query_col()
    mycol = refer_collection()

    connect_str = os.getenv('AZURE_STORAGE_CONNECTION_STRING')
    query_client = QueueClient.from_connection_string(connect_str,
                                                      "query-queue")

    while (True):
        # print('q')
        time.sleep(200)
        rows = query_client.receive_messages()
        for msg in rows:
            time.sleep(10)

            row = msg.content
            print(row)
            row = ast.literal_eval(row)
            print('getting_id', row[0])
            entry_id = ObjectId(row[0])
            query_data_entry = query_collection.find({"_id": entry_id})
            data = [i for i in query_data_entry]
            #check_for_the_completion_of_components
            try:
                associated_entries = data[0]['associated_entries']
                print('getting associated entries')
                completed_count = 0
                for each_entry_res in associated_entries:
                    res_entry = mycol.find({"_id": each_entry_res})
                    # print("profile",each_entry_res)
                    data_res = [i for i in res_entry]
                    if (data_res[0]['simplified_dump_state'] == 'Completed'):
                        completed_count += 1

                print('completed_count', completed_count)
                print('entry_count', data[0]['entry_count'])

                if (completed_count == data[0]['entry_count']):
                    print("All the entries are completed for the query",
                          completed_count)
                    query_collection.update_one(
                        {'_id': entry_id}, {'$set': {
                            'state': 'Completed'
                        }})
                    query_client.delete_message(msg)

            except KeyError as e:
                print('Query is not yet ready', e)
            except IndexError as e:
                print('Query entry not yet available')
            except Exception as e:
                print("Exception Occured during dumping ", e)
Example #10
def process_data_m(id_list, mode):
    tagged_data = []
    mycol = refer_collection()
    for entry_id in id_list:
        comp_data_entry = mycol.find({"_id": entry_id})
        data_o = [i for i in comp_data_entry]
        # print(data_o)
        try:
            if (mode == 'test'):
                class_tag = data_o[0]['comp_name']
            if (mode == 'train'):
                class_tag = data_o[0]['industry']

            word_cloud_results = data_o[0]['wordcloud_results_tri']
            word_cloud_tokens = [term[0] for term in word_cloud_results]
            # print(word_cloud_tokens)

            text_rank_tokens = data_o[0]["textrank_results"]
            # print(text_rank_tokens)

            title = data_o[0]["title"].split(" ")

            meta_description = data_o[0]["description"].split(" ")

            guided_lda_res = data_o[0]["guided_lda_results"]
            guided_lda_tokens = [j for i in guided_lda_res for j in i]
            # print(guided_lda_tokens)

            lda_topics = data_o[0]["lda_results"]
            lda_tokens = []
            for eac_re in lda_topics:
                lda_tokens = lda_tokens + lda_topics[eac_re]
            # print(lda_tokens)

            kpe_tokens = data_o[0]['kpe_results']
            # print(kpe_tokens)

            token_set = lda_tokens + word_cloud_tokens + text_rank_tokens + title + meta_description + guided_lda_tokens + kpe_tokens  # combining all features
            # name_set.append(path_f)
            tagged_data.append([
                data_o[0]["_id"],
                TaggedDocument(words=token_set, tags=[class_tag])
            ])

        except KeyError:
            print("prediction is skipped as features no present, entry id ",
                  entry_id)
            # print(KeyError)
            continue
    return tagged_data
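
process_data_m returns (entry_id, TaggedDocument) pairs, which is the input shape gensim's Doc2Vec expects. A hedged sketch of the follow-on training step (the helper name and hyperparameters here are illustrative, not taken from the original project):

from gensim.models.doc2vec import Doc2Vec

def train_doc2vec(tagged_data, vector_size=100, epochs=40):
    documents = [doc for _entry_id, doc in tagged_data]  # drop the Mongo _id
    model = Doc2Vec(vector_size=vector_size, min_count=2, epochs=epochs)
    model.build_vocab(documents)
    model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)
    return model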
Example #11
def run_rake_model(entry_id, rake_limit):
    from nltk.corpus import stopwords
    stop_words = stopwords.words('english')
    stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

    mycol = refer_collection()
    comp_data_entry = mycol.find({"_id": entry_id})
    data = [i for i in comp_data_entry]
    print("rake model started", str(data[0]['_id']), data[0]['link'])
    try:
        h_p_data = data[0]["paragraph_text"] + data[0][
            "header_text"]  # do topic extraction on paragraph and header text

        combined_text = " ".join(h_p_data)

        r = Rake(max_length=3,
                 min_length=1,
                 ranking_metric=Metric.DEGREE_TO_FREQUENCY_RATIO)
        r.extract_keywords_from_text(combined_text)
        res_words = r.get_ranked_phrases()
        if (len(res_words)):
            print(res_words[:rake_limit])
            mycol.update_one(
                {'_id': entry_id},
                {'$set': {
                    'rake_results': res_words[:rake_limit]
                }})
            print("Successfully extended the data entry with rake results",
                  entry_id)
        else:
            mycol.update_one({'_id': entry_id}, {'$set': {'rake_results': []}})
            print("vocabulary is empty")

    except Exception:
        mycol.update_one({'_id': entry_id}, {'$set': {'rake_results': []}})
        print("vocabulary is empty")


# run_rake_model("F://Armitage_project/crawl_n_depth/extracted_json_files/www.axcelerate.com.au_0_data.json",50)
Example #12
def grab_rich_description(id_list):
    mycol = refer_collection()
    for entry_id in id_list:
        try:
            rich_description = 'None'
            # time.sleep(120)
            print(entry_id)
            comp_data_entry = mycol.find({"_id": entry_id})
            data = [i for i in comp_data_entry]
            link = data[0]['link']
            print(link)
            browser = use_chrome()
            browser.get(link)
            time.sleep(5)
            pageSource = browser.page_source
            browser.quit()
            # browser.close()
            soup = BeautifulSoup(pageSource, 'html.parser')
            metas = soup.find_all('meta')
            # print(metas)
            meta_description = [meta.attrs['content'] for meta in metas if
                                'name' in meta.attrs and meta.attrs['name'] == 'description']
            og_description = [meta.attrs['content'] for meta in metas if
                              'property' in meta.attrs and meta.attrs['property'] == 'og:description']
            # twitter_description =  [meta.attrs['content'] for meta in metas if 'name' in meta.attrs and meta.attrs['name'] == 'twitter:description']
            if (meta_description != og_description):
                rich_description = meta_description + og_description
            else:
                rich_description = meta_description

            rich_description = '_'.join(rich_description)
            rich_description = rich_description.replace(',', ' ')
            print('***', rich_description)

            mycol.update_one({'_id': entry_id},
                             {'$set': {'rich_description':rich_description}})
        except Exception:
            mycol.update_one({'_id': entry_id},
                             {'$set': {'rich_description': 'None'}})
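
grab_rich_description relies on a use_chrome helper that is not shown in these excerpts. A plausible headless-Chrome setup that would satisfy those calls might look like this (an assumption, not the project's actual helper):

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

def use_chrome():
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    return webdriver.Chrome(options=options)  # assumes a compatible chromedriver is on PATH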
Example #13
def get_li_data_ano(id_list):
    mycol = refer_collection()
    for entry_id in id_list:
        # time.sleep(120)
        print(entry_id)
        li_url = get_li_url(entry_id)
        if (li_url):
            print(li_url)
            # blockPrint()
            comp_li_data = scrape_li_ano(li_url)
            # enablePrint()
            # print(comp_li_data)
            corrected_dict = comp_li_data
            # for k in corrected_dict:
            #     print("'"+str(k)+"'")
            # print(corrected_dict)
            if (len(corrected_dict.keys())):
                mycol.update_one({'_id': entry_id}, {'$set': corrected_dict})
                print(
                    "Successfully extended the data entry with linkedin profile information",
                    entry_id)
            else:
                print("No linkedin profile found! dict is empty")
Example #14
def get_cp_dnb(entry_id,mode):
    mycol = refer_collection()
    comp_data_entry = mycol.find({"_id": entry_id})
    # print(comp_data_entry)
    data = [i for i in comp_data_entry]
    # comp_name = data[0]['search_text']
    # print(data)
    try:
        if mode=='comp':
            comp_name = data[0]['search_text']
        elif mode == 'query':
            print(data)
            comp_name = data[0]['comp_name']
    except KeyError:
        comp_name = data[0]['link'].split("/")[2]
    det = [comp_name]
    sr = getGoogleLinksForSearchText(comp_name + " dnb.com", 3, 'normal')
    filtered_dnb = []
    for p in sr:
        if 'dnb.com/business-directory/company-profiles' in p['link']:
            filtered_dnb.append([p['title'], p['link']])
    if (len(filtered_dnb)):
        print("dnb profile found and extracting contact persons..")
        print(filtered_dnb[0])
        det.append(filtered_dnb[0])
        print(filtered_dnb[0][1])
        det.append(scrape_dnb(filtered_dnb[0][1]))
        print(det)
        mycol.update_one({'_id': entry_id},
                         {'$set': {'dnb_cp_info': det}})
        print("Successfully extended the data entry with dnb contact person data", entry_id)
    else:
        print("No dnb profile found!,Try again..")
        print(det)
        mycol.update_one({'_id': entry_id},
                         {'$set': {'dnb_cp_info': det}})
Example #15
def fix_entry_counts():
    print("Query state updating queue is live")
    query_collection = refer_query_col()
    mycol = refer_collection()

    connect_str = os.getenv('AZURE_STORAGE_CONNECTION_STRING')
    query_client = QueueClient.from_connection_string(connect_str,
                                                      "query-queue")
    rows = query_client.receive_messages()
    for msg in rows:
        # time.sleep(10)

        row = msg.content
        print(row)
        row = ast.literal_eval(row)
        print('getting_id', row[0])
        entry_id = ObjectId(row[0])
        query_data_entry = query_collection.find({"_id": entry_id})
        data = [i for i in query_data_entry]
        # check_for_the_completion_of_components

        associated_entries = data[0]['associated_entries']
        print('getting associated entries')
        print(len(associated_entries))
Example #16
def key_phrase_extract(
    entry_id, number_of_candidates
):  #this will extract paragraph and header text from given json file and extract the topics from that
    extractor = TopicRank()

    mycol = refer_collection()
    comp_data_entry = mycol.find({"_id": entry_id})
    data = [i for i in comp_data_entry]
    print("key_phrase extraction started", str(data[0]['_id']),
          data[0]['link'])
    try:
        h_data = data[0][
            "header_text"]  # key phrase extraction is run on the header text

        data_hp = " ".join(h_data)
        with open(
                'temp_text.txt', 'w', encoding='utf-8'
        ) as f:  # write the extracted text to a .txt file for the extractor to load
            f.write(data_hp)

        extractor.load_document(
            input='temp_text.txt',
            language="en",
            max_length=10000000,  #load text file
            normalization='stemming')
        #get stop words list
        stoplist = stopwords.words('english')

        # select the keyphrase candidates, for TopicRank the longest sequences of
        # nouns and adjectives
        extractor.candidate_selection(pos={'NOUN', 'PROPN', 'ADJ'},
                                      stoplist=stoplist)

        # weight the candidates using a random walk. The threshold parameter sets the
        # minimum similarity for clustering, and the method parameter defines the
        # linkage method

        extractor.candidate_weighting(threshold=0.74, method='average')

        # print the n-highest (10) scored candidates
        kpe_results = []
        for (keyphrase, score) in extractor.get_n_best(n=number_of_candidates,
                                                       stemming=True):
            kpe_results.append([keyphrase, score])
        print("key phrase extraction completed")
        # print(kpe_results)
        kpe_words = [i[0] for i in kpe_results]
        # print(kpe_words)
        print(kpe_words)
        mycol.update_one({'_id': entry_id},
                         {'$set': {
                             'kpe_results': kpe_words
                         }})
        print("Successfully extended the data entry with kpe results",
              entry_id)

    except Exception:  #handling exceptions if corpus is empty
        print("Observations set is empty or not valid")
        mycol.update_one({'_id': entry_id}, {'$set': {'kpe_results': []}})
        return "Observations set is empty or not valid"
Example #17
def run_sequential_crawlers_m_via_queue_chain(
        depth_limit, crawl_limit):  #method used to run the crawler
    """
    :param depth_limit: maximum depth to crawl
    :param crawl_limit: maximum number of pages to crawl
    :return:
    """
    try:
        mycol = refer_collection()
        connect_str = os.getenv('AZURE_STORAGE_CONNECTION_STRING')
        deep_crawling_client = QueueClient.from_connection_string(
            connect_str, "deep-crawling-queue")
        f_e_client = QueueClient.from_connection_string(
            connect_str, "feature-extraction-queue")
        # mycol = refer_cleaned_collection()
        while (True):
            rows = deep_crawling_client.receive_messages()
            for msg in rows:
                # time.sleep(120)
                row = msg.content
                row = ast.literal_eval(row)
                print(row[0])
                entry_id = ObjectId(row[0])
                comp_data_entry = mycol.find({"_id": entry_id})
                data = [i for i in comp_data_entry]
                print(data)
                if (data[0]['link'] == 'None'): continue
                print(data[0]['link'])
                print("started", data[0]['_id'],
                      data[0]['link'] + " scraping in to n depth")

                #configuring the crawlers
                # lists for collecting crawling data
                crawled_links = []
                header_text = []
                paragraph_text = []
                telephone_numbers = []
                emails = []
                social_media_links = []
                addresses = []
                allowed_domains = data[0]['link'].split("/")[
                    2]  # derive the allowed domain from the start URL itself
                print('allowed_dm', allowed_domains)

                custom_settings = {
                    'DEPTH_LIMIT':  #setting depth limit of crawling
                    str(depth_limit),
                }
                # crawl_limit caps the number of pages the spider will visit
                yield runner.crawl(NCrawlerSpider,
                                   start_urls=[
                                       data[0]['link'],
                                   ],
                                   allowed_domains=[
                                       allowed_domains,
                                   ],
                                   custom_settings=custom_settings,
                                   crawled_links=crawled_links,
                                   header_text=header_text,
                                   paragraph_text=paragraph_text,
                                   telephone_numbers=telephone_numbers,
                                   addresses=addresses,
                                   social_media_links=social_media_links,
                                   emails=emails,
                                   crawl_limit=crawl_limit,
                                   entry_id=entry_id)
                print("completed_crawling and message removing from the queue")
                deep_crawling_client.delete_message(msg)
                mycol.update_one(
                    {'_id': entry_id},
                    {'$set': {
                        'deep_crawling_state': 'Completed'
                    }})

                if (len(paragraph_text) == 0):
                    get_cp_page_data([entry_id])
                print("Adding to feature extraction queue")
                f_e_client.send_message([str(entry_id)], time_to_live=-1)
                mycol.update_one(
                    {'_id': entry_id},
                    {'$set': {
                        'feature_extraction_state': 'Incomplete'
                    }})
        # print("reactor is stopping")
        # reactor.callFromThread(reactor.stop)
        # print(' reactor stops',threading.currentThread().ident)
        reactor.stop()
    except Exception as e:
        print("Error has occured..try again!", e)
Example #18
    p8 = Process(target=get_aven_data_via_queue)
    p8.start()
    p9 = Process(target=get_ad_from_google_via_queue)
    p9.start()
    p10 = Process(target=get_dnb_data_via_queue)
    p10.start()
    p11 = Process(target=get_tp_from_google_via_queue)
    p11.start()
    p12 = Process(target=get_cb_data_via_queue)
    p12.start()
    p13 = Process(target=simplified_export_via_queue)
    p13.start()

    connect_str = os.getenv('AZURE_STORAGE_CONNECTION_STRING')
    ic_client = QueueClient.from_connection_string(connect_str, "initial-crawling-queue")
    mycol = refer_collection()
    while (True):
        rows = ic_client.receive_messages()
        for msg in rows:
            # time.sleep(120)
            row = msg.content
            row = ast.literal_eval(row)
            print(row[0])

            input_d = row[0].split("--")
            try:
                mode = input_d[1]
                s_text = input_d[0]
                if mode == 'query':
                    query = s_text.strip()
                    print("Searching a query")
Example #19
    def spider_closed(
            self,
            spider):  # once the spider finishes crawling, persist the collected data
        spider.logger.info('Spider closed dumping data: %s', spider.name)
        print('spider is closed dumping data.....')
        #remove duplicates
        self.crawled_links = list(set(self.crawled_links))
        self.header_text = list(set(self.header_text))
        self.paragraph_text = list(set(self.paragraph_text))
        self.emails = list(set(self.emails))
        self.addresses = list(set(self.addresses))
        # print("old",self.addresses)
        self.addresses = getting_uniques(self.addresses)
        self.social_media_links = list(set(self.social_media_links))
        self.telephone_numbers = list(set(self.telephone_numbers))
        # print(self.social_media_links)

        fixed_addresses_with_sources = []
        for ad in self.addresses:
            for ad_s in self.addresses_with_links:
                if (ad in ad_s[0]):
                    # print(ad, ad_s[1])
                    fixed_addresses_with_sources.append([ad, ad_s[1]])
                    break
        fixed_tp_with_sources = []
        for tp in self.telephone_numbers:
            for tp_s in self.telephone_numbers_with_links:
                if (tp in tp_s[0]):
                    # print(ad, ad_s[1])
                    fixed_tp_with_sources.append([tp, tp_s[1]])
                    break

        fixed_sm_with_sources = []
        for sm in self.social_media_links:
            for sm_s in self.social_media_links_with_links:
                if (sm in sm_s[0]):
                    # print(sm, sm_s[1])
                    fixed_sm_with_sources.append([sm, sm_s[1]])
                    break

        fixed_em_with_sources = []
        for em in self.emails:
            for em_s in self.emails_with_links:
                if (em in em_s[0]):
                    # print(ad, ad_s[1])
                    fixed_em_with_sources.append([em, em_s[1]])
                    break

        fixed_ht_with_sources = []
        for ht in self.header_text:
            for ht_s in self.header_text_with_links:
                if (ht in ht_s[0]):
                    # print(ad, ad_s[1])
                    fixed_ht_with_sources.append([ht, ht_s[1]])
                    break

        fixed_pt_with_sources = []
        for pt in self.paragraph_text:
            for pt_s in self.paragraph_text_with_links:
                if (pt in pt_s[0]):
                    # print(ad, ad_s[1])
                    fixed_pt_with_sources.append([pt, pt_s[1]])
                    break
        print("with_links", self.addresses_with_links)
        print("fixed", fixed_addresses_with_sources)
        n_depth_data = {
            'crawled_links': self.crawled_links,
            'header_text': self.header_text,
            'paragraph_text': self.paragraph_text,
            'emails': self.emails,
            'addresses': self.addresses,
            'social_media_links': self.social_media_links,
            'telephone_numbers': self.telephone_numbers,
            'website_addresses_with_sources': fixed_addresses_with_sources,
            'header_text_with_sources': fixed_ht_with_sources,
            'paragraph_text_with_sources': fixed_pt_with_sources,
            'emails_with_sources': fixed_em_with_sources,
            'social_media_links_with_sources': fixed_sm_with_sources,
            'telephone_numbers_with_sources': fixed_tp_with_sources,
        }
        # print("size", len(self.paragraph_text))
        # print("address",self.addresses)
        # print("addresses_with_links",fixed_addresses_with_sources)

        # for i in self.addresses:
        #     for j in self.addresses:
        #         if
        # print("telephone", self.telephone_numbers)
        try:
            mycol = refer_collection()
            mycol.update_one({'_id': self.entry_id}, {'$set': n_depth_data})

            print("Successfully extended the data entry", self.entry_id)
        except Exception:
            print("Max document size reached..data is truncated!")
            n_depth_data['paragraph_text'] = self.paragraph_text[:5000]
            mycol = refer_collection()
            mycol.update_one({'_id': self.entry_id}, {'$set': n_depth_data})

            print("Successfully extended the data entry", self.entry_id)
Example #20
def run_wordcloud_model(
    entry_id, mode
):  # this will extract paragraph and header text from given json file and extract the topics from that

    mycol = refer_collection()
    comp_data_entry = mycol.find({"_id": entry_id})
    data = [i for i in comp_data_entry]
    print("wordcloud model started", str(data[0]['_id']), data[0]['link'])
    try:
        h_p_data = data[0]["paragraph_text"] + data[0][
            "header_text"]  # do topic extraction on paragraph and header text

        wordcloud = WordCloud(background_color="white",
                              max_words=100,
                              contour_width=3,
                              contour_color='steelblue')
        # Generate a word cloud
        data_words = list(sent_to_words(h_p_data))
        # data_words_nostops = remove_stopwords(data_words)
        data_lemmatized = lemmatization(data_words,
                                        allowed_postags=['NOUN', 'ADJ'])
        # data_lemmatized = lemmatization(data_words_nostops, allowed_postags=['NOUN', 'ADJ'])
        # print(data_lemmatized)
        all_tokens = [j for i in data_lemmatized for j in i]
        # print('all', all_tokens)
        all_tokens = [
            value for value in all_tokens
            if (value != 'other' and value != 'day' and value != 'thing'
                and value != 'last')
        ]

        if mode == 'single':
            combined_text = " ".join(all_tokens)
        else:
            if mode == 'bi':
                finder = BigramCollocationFinder.from_words(all_tokens)
                bigram_measures = BigramAssocMeasures()
                scored = finder.score_ngrams(bigram_measures.raw_freq)
            if mode == 'tri':
                # print(combined_text)
                # setup and score the bigrams using the raw frequency.
                finder = TrigramCollocationFinder.from_words(all_tokens)
                trigram_measures = TrigramAssocMeasures()
                scored = finder.score_ngrams(trigram_measures.raw_freq)

            # print(scored)

            # By default finder.score_ngrams is sorted, however don't rely on this default behavior.
            # Sort highest to lowest based on the score.
            scoredList = sorted(scored, key=itemgetter(1), reverse=True)
            # print('sclist',scoredList)
            # word_dict is the dictionary we'll use for the word cloud.
            # Load dictionary with the FOR loop below.
            # The dictionary will look like this with the bigram and the score from above.
            # word_dict = {'bigram A': 0.000697411,
            #             'bigram B': 0.000524882}

            word_dict = {}
            # Join each n-gram into a contiguous string for the dictionary key
            # and map it to its score.
            for gram, score in scoredList:
                word_dict['_'.join(gram)] = score
            # print('dic',word_dict)

        if mode == 'single':
            wordcloud.generate(combined_text)
        else:
            wordcloud.generate_from_frequencies(word_dict)
    except Exception:
        print("cannot make word cloud for empty text")
        mycol.update_one({'_id': entry_id},
                         {'$set': {
                             'wordcloud_results_' + mode: []
                         }})
        print("vocabulary is empty")
        return "Vocabulary is empty"

    # Visualize the word cloud
    wordcloud.to_image()
    wordcloud_words = []

    word_cloud_results = []
    for each_res in wordcloud.words_:

        word_cloud_results.append([each_res, wordcloud.words_[each_res]])
        wordcloud_words.append(each_res)
    # print('words', wordcloud_words)
    # plt.imshow(wordcloud, interpolation='bilinear')
    # plt.axis("off")
    # plt.savefig(name_j[:-4]+"png")
    # plt.show()
    print(word_cloud_results)
    # return wordcloud_words

    mycol.update_one(
        {'_id': entry_id},
        {'$set': {
            'wordcloud_results_' + mode: word_cloud_results
        }})
    print("Successfully extended the data entry with wordcloud results",
          entry_id)


# run_wordcloud_model("F://Armitage_project//crawl_n_depth//extracted_json_files//3_www.hydroterra.com.au_data.json")
Example #21
def get_missed_to_csv(prof_list):
    with open('EDU_deep_keywords.csv', mode='w', encoding='utf8',
              newline='') as results_file:
        results_writer = csv.writer(results_file,
                                    delimiter=',',
                                    quotechar='"',
                                    quoting=csv.QUOTE_MINIMAL)
        results_writer.writerow([
            'link', 'title', 'description', 'rake_results', 'textrank_results',
            'kpe_results', 'wordcloud_results', 'IsPrivateEquity'
        ])
        for k in prof_list:
            search_t = k + ' Australia'
            mycol = refer_collection()
            entry = mycol.find({"search_text": search_t})
            data = [d for d in entry]
            iseq = False
            eq_t = [
                'Investor', 'Invested', 'Private Equity', 'Acquired',
                'Allocated capital'
            ]
            # print(data)
            try:
                all_text = (',').join(data[0]['paragraph_text']) + (',').join(
                    data[0]['header_text'])
                for term in eq_t:
                    if term in all_text:
                        iseq = True
                        break
                row = [
                    data[0]['link'], data[0]['title'], data[0]['description'],
                    data[0]['rake_results'][:10],
                    data[0]['textrank_results'][:10],
                    data[0]['kpe_results'][:10],
                    [k[0] for k in data[0]['wordcloud_results_tri'][:10]], iseq
                ]
            except Exception as e:
                print(k)
                p = k.split('//')[1].replace('www.', '')
                c_name = p.split('.com')[0]

                entry = mycol.find({"comp_name": c_name})
                data = [d for d in entry]
                try:
                    all_text = (',').join(data[0]['paragraph_text']) + (
                        ',').join(data[0]['header_text'])
                    for term in eq_t:
                        if term in all_text:
                            iseq = True
                            break
                    row = [
                        data[0]['link'], data[0]['title'],
                        data[0]['description'], data[0]['rake_results'][:10],
                        data[0]['textrank_results'][:10],
                        data[0]['kpe_results'][:10],
                        [k[0]
                         for k in data[0]['wordcloud_results_tri'][:10]], iseq
                    ]
                except Exception as e:
                    print(k, "******")
                    continue
                # q = k + ' --comp'
                # add_to_initial_crawling_queue([q])
            results_writer.writerow(row)
    results_file.close()


# get_missed_to_csv(missed_list)
Example #22
def run_guided_lda_model(
    entry_id, number_of_topics
):  #this will extract paragraph and header text from given json file and extract the topics from that
    mycol = refer_collection()
    comp_data_entry = mycol.find({"_id": entry_id})
    data = [i for i in comp_data_entry]
    print("guided lda model started", str(data[0]['_id']), data[0]['link'])
    try:
        h_p_data = data[0]["paragraph_text"] + data[0][
            "header_text"]  # do topic extraction on paragraph and header text

        combined_text = " ".join(h_p_data)
        doc_list = [combined_text]

        token_vectorizer = CountVectorizer(stop_words=stop_words)

        X = token_vectorizer.fit_transform(doc_list)
        tf_feature_names = token_vectorizer.get_feature_names()
        word2id = dict((v, idx) for idx, v in enumerate(tf_feature_names))

        seed_topic_list = [[
            'about', 'vision', 'mission', 'approach', 'team', 'clients'
        ], ['course', 'service', 'work', 'task'],
                           [
                               'address', 'australia', 'contact', 'email',
                               'location', 'call', 'social'
                           ]]

        model = guidedlda.GuidedLDA(n_topics=number_of_topics,
                                    n_iter=100,
                                    random_state=7,
                                    refresh=20)
        seed_topics = {}

        for t_id, st in enumerate(seed_topic_list):
            for word in st:
                if (word in word2id.keys()):
                    seed_topics[word2id[word]] = t_id
                else:
                    try:
                        word2id[word] = str(int(list(word2id.keys())[-1]) + 1)
                        seed_topics[word2id[word]] = t_id
                    except ValueError:
                        pass

        model.fit(X, seed_topics=seed_topics, seed_confidence=0.35)

        n_top_words = 10
        topic_word = model.topic_word_
        topics_set = []
        for i, topic_dist in enumerate(topic_word):
            topic_words = np.array(tf_feature_names)[np.argsort(
                topic_dist)][:-(n_top_words + 1):-1]
            words = [w for w in topic_words]
            topics_set.append(words)
            # print('Topic {}: {}'.format(i, ' '.join(topic_words)))
        print('extracting topics...')
        print(topics_set)
        mycol.update_one({'_id': entry_id},
                         {'$set': {
                             'guided_lda_results': topics_set
                         }})
        print("Successfully extended the data entry with guided lda results",
              entry_id)
    except Exception:
        print("Vocabulary is empty")
        mycol.update_one({'_id': entry_id},
                         {'$set': {
                             'guided_lda_results': []
                         }})
        return "No enough Data/Vocabulary is empty"
Example #23
def get_entries_project(project_id):
    completed_count = []
    incomplete_count = 0
    incompletes = []
    problems = []
    all_entries = []
    profile_col = refer_collection()
    projects_col = refer_projects_col()
    query_collection = refer_query_col()
    proj_data_entry = projects_col.find({"_id": project_id})
    print('proj', proj_data_entry)
    proj_data = [i for i in proj_data_entry]
    print('data', len(proj_data))
    proj_attribute_keys = list(proj_data[-1].keys())
    if ('associated_queries' in proj_attribute_keys):
        associated_queries = proj_data[-1]['associated_queries']
        for each_query in associated_queries:
            query_data_entry = query_collection.find(
                {"_id": ObjectId(each_query)})
            query_data = [i for i in query_data_entry]
            print([
                query_data[0]['search_query'], query_data[0]['state'],
                query_data[0]['_id']
            ])
            query_attribute_keys = list(query_data[0].keys())
            if ('associated_entries' in query_attribute_keys):
                associated_entries = query_data[0]['associated_entries']
                # print('kk',associated_entries)
                obs_ids = [ObjectId(i) for i in associated_entries]
                all_entries.extend(obs_ids)

                for k in obs_ids:
                    prof_data_entry = profile_col.find({"_id": k})
                    # print('proj', proj_data_entry)
                    prof_data = [i for i in prof_data_entry]
                    prof_attribute_keys = list(prof_data[0].keys())

                    if ('simplified_dump_state' in prof_attribute_keys):
                        if (prof_data[0]['simplified_dump_state'] ==
                                'Completed'):
                            completed_count.append(k)
                        # else:print(prof_data[0]['simplified_dump_state'])
                        elif (prof_data[0]['simplified_dump_state'] ==
                              'Incomplete'):
                            incomplete_count += 1
                            incompletes.append(k)
                        else:
                            problems.append(k)
                    else:
                        problems.append(k)
                #
                # print(['completed',completed_count,'all',len(obs_ids),'incompleted',incomplete_count,incompletes,'prob',problems])
                # # filt = []
                # # for k in obs_ids:
                # #     if(k not in problems):
                # #         filt.append(k)
                # # print('filt',filt)
                # if(completed_count==len(obs_ids)):
                #     query_collection.update_one({'_id': ObjectId(each_query)}, {'$set': {'state': 'Completed'}})

                # return obs_ids

        print('completed_count', len(list(set(completed_count))))
        print('incomplete_count', incomplete_count)
        print('incompletes', list(set(incompletes)))
        print('problems', list(set(problems)))
        print('all', all_entries)
        return {
            'incompletes': list(set(incompletes)),
            'problems': list(set(problems))
        }
        # all_entries = list(set(all_entries))
        # return all_entries
    else:
        print("This project do not have any queries yet")
        return []
Example #24
def repair_wanted_parts(entry_id_list):
    profile_col = refer_collection()
    for k in entry_id_list:
        print('*****************')
        prof_data_entry = profile_col.find({"_id": k})
        # print('proj', proj_data_entry)
        prof_data = [i for i in prof_data_entry]
        prof_attribute_keys = list(prof_data[0].keys())
        if ('deep_crawling_state' in prof_attribute_keys):
            print('yes')
            if (prof_data[0]['deep_crawling_state'] == 'Completed'):
                print('deep_crawling_state_already_done')
            else:
                add_to_deep_crawling_queue([k])
        else:
            add_to_deep_crawling_queue([k])
        if ('feature_extraction_state' in prof_attribute_keys):
            if (prof_data[0]['feature_extraction_state'] == 'Completed'):
                print('feature_extraction_state_already_done')
            else:
                add_to_deep_crawling_queue([k])
        else:
            add_to_deep_crawling_queue([k])
        #
        if ('classification_state' in prof_attribute_keys):
            if (prof_data[0]['classification_state'] == 'Completed'):
                print('classification_state_already_done')
            else:
                add_to_deep_crawling_queue([k])
        else:
            add_to_deep_crawling_queue([k])

        if ('owler_qa_state' in prof_attribute_keys):
            if (prof_data[0]['owler_qa_state'] == 'Completed'):
                print('owler_qa_state_already_done')
            else:
                print("Adding to Owler QA extraction queue")
                add_to_qa_queue([k])
        else:
            print("Adding to Owler QA extraction queue")
            add_to_qa_queue([k])

        if ('google_cp_state' in prof_attribute_keys):
            if (prof_data[0]['google_cp_state'] == 'Completed'):
                print('google_cp_state_already_done')
            else:
                print("Adding to google contact person extraction queue")
                add_to_cp_queue([k])
        else:
            print("Adding to google contact person extraction queue")
            add_to_cp_queue([k])

        if ('oc_extraction_state' in prof_attribute_keys):
            if (prof_data[0]['oc_extraction_state'] == 'Completed'):
                print('oc_extraction_state_already_done')
            else:
                print("Adding to Opencorporates extraction queue")
                add_to_oc_queue([k])
        else:
            print("Adding to Opencorporates extraction queue")
            add_to_oc_queue([k])

        if ('google_address_state' in prof_attribute_keys):
            if (prof_data[0]['google_address_state'] == 'Completed'):
                print('google_address_state_already_done')
            else:
                print("Adding to google address extraction queue")
                add_to_ad_queue([k])
        else:
            print("Adding to google address extraction queue")
            add_to_ad_queue([k])

        if ('dnb_extraction_state' in prof_attribute_keys):
            if (prof_data[0]['dnb_extraction_state'] == 'Completed'):
                print('dnb_extraction_state_already_done')
            else:
                print("Adding to DNB extraction queue")
                add_to_dnb_queue([k])
        else:
            print("Adding to DNB extraction queue")
            add_to_dnb_queue([k])

        #
        if ('google_tp_state' in prof_attribute_keys):
            if (prof_data[0]['google_tp_state'] == 'Completed'):
                print('google_tp_state_already_done')
            else:
                print("Adding to google tp extraction queue")
                add_to_tp_queue([k])
        else:
            print("Adding to google tp extraction queue")
            add_to_tp_queue([k])
        #
        #
        if ('crunchbase_extraction_state' in prof_attribute_keys):
            if (prof_data[0]['crunchbase_extraction_state'] == 'Completed'):
                print('crunchbase_extraction_state_already_done')
            else:
                print("Adding to Crunchbase extraction queue")
                add_to_cb_queue([k])
        else:
            print("Adding to Crunchbase extraction queue")
            add_to_cb_queue([k])
        #
        #
        if ('li_cp_state' in prof_attribute_keys):
            if (prof_data[0]['li_cp_state'] == 'Completed'):
                print('li_cp_state_already_done')
            else:
                print("Adding to linkedin cp extraction queue")
                add_to_li_cp_queue([k])
        else:
            print("Adding to linkedin cp extraction queue")
            add_to_li_cp_queue([k])
        #
        #
        #
        #
        if ('simplified_dump_state' in prof_attribute_keys):
            if (prof_data[0]['simplified_dump_state'] == 'Completed'):
                print('simplified_dump_state already_done')
            else:
                print("Adding to simplified dump queue")
                add_to_simplified_export_queue([k])
        else:
            print("Adding to simplified dump queue")
            add_to_simplified_export_queue([k])
Example #25
def execute_for_a_company(comp_name):
    mycol = refer_collection()
    print("Searching a company")
    dateTimeObj = datetime.now()
    query_collection = refer_query_col()
    data_q = {'started_time_stamp': dateTimeObj, 'search_query': comp_name}
    record_entry = query_collection.insert_one(data_q)
    print("Started on", dateTimeObj)
    started = time.time()
    print("***Initial Crawling Phrase***")
    entry_id = search_a_company(comp_name, mycol, record_entry.inserted_id)
    if (entry_id == None):
        for i in range(3):
            print("Initial crawling incomple..retrying", i)
            entry_id = search_a_company(comp_name, mycol,
                                        record_entry.inserted_id)
            time.sleep(5)
            if (entry_id != None): break
    if (entry_id == None):
        print("Initial crawling incomple..retrying unsuccessful")
    elif (entry_id == 'exist'):
        print("Existing profile found. pipeline exits")
    else:
        print("entry id received ", entry_id)
        print("***Deep Crawling Phrase***")
        deep_crawl([entry_id], 3, 100)
        print(
            "Deep crawling completed and record extended with crawled_links,header_text,paragraph_text,social_media_links,telephone numbers,emails,addresses"
        )
        print("***Feature Extraction Phrase***")
        extract_features([entry_id])

        print("***Contact Person Extraction Phrase***")
        extract_contact_persons([entry_id], 'comp')

        print(("***Predict the company type***"))
        predict_class_tags([entry_id])

        print(("***Extract crunchbase profile data***"))
        get_cb_data([entry_id])

        print(("***Extract linkedin profile data***"))
        get_li_data([entry_id])

        print(("***Extract opencorporates profile data***"))
        get_oc_data([entry_id])

        print(("***Extract dnb profile data***"))
        get_dnb_data([entry_id])

        print("***Addresses from google***")
        get_ad_from_google([entry_id])
        print("***Addresses extraction completed***")

        print("***cp from google***")
        get_cp_from_google([entry_id])
        print("***cp extraction completed***")

        print("***phone numbers from google***")
        get_tp_from_google([entry_id])
        print("***tp extraction completed***")

        print("***frequent questions google***")
        get_qa_from_google([entry_id])
        print("***qa extraction completed***")

        print(("***Dumping the results***"))
        # export_profiles([entry_id],record_entry.inserted_id)
        simplified_export([entry_id])
        ended = time.time()
        duration = ended - started
        dateTimeObj_e = datetime.now()
        completion_data = {
            'completed_time_stamp': dateTimeObj_e,
            'elapsed_time': duration
        }
        print(completion_data)
        query_collection.update_one({'_id': record_entry.inserted_id},
                                    {'$set': completion_data})
        print("Pipeline execution completed, elapsed time:", duration)