Example #1
0
def query_state_update_via_queue():
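    """Poll the 'query-queue' and mark a query document as 'Completed' once
    every profile in its 'associated_entries' has
    simplified_dump_state == 'Completed'; the queue message is deleted only
    after the update."""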
    print("Query state updating queue is live")
    query_collection = refer_query_col()
    mycol = refer_collection()

    connect_str = os.getenv('AZURE_STORAGE_CONNECTION_STRING')
    query_client = QueueClient.from_connection_string(connect_str,
                                                      "query-queue")

    while True:
        # print('q')
        time.sleep(200)
        rows = query_client.receive_messages()
        for msg in rows:
            time.sleep(10)

            row = msg.content
            print(row)
            row = ast.literal_eval(row)
            print('getting_id', row[0])
            entry_id = ObjectId(row[0])
            query_data_entry = query_collection.find({"_id": entry_id})
            data = [i for i in query_data_entry]
            #check_for_the_completion_of_components
            try:
                associated_entries = data[0]['associated_entries']
                print('getting associated entries')
                completed_count = 0
                for each_entry_res in associated_entries:
                    res_entry = mycol.find({"_id": each_entry_res})
                    # print("profile",each_entry_res)
                    data_res = [i for i in res_entry]
                    if (data_res[0]['simplified_dump_state'] == 'Completed'):
                        completed_count += 1

                print('completed_count', completed_count)
                print('entry_count', data[0]['entry_count'])

                if (completed_count == data[0]['entry_count']):
                    print("All the entries are completed for the query",
                          completed_count)
                    query_collection.update_one(
                        {'_id': entry_id}, {'$set': {
                            'state': 'Completed'
                        }})
                    query_client.delete_message(msg)

            except KeyError as e:
                print('Query is not yet ready', e)
            except IndexError as e:
                print('Query entry not yet available')
            except Exception as e:
                print("Exception occurred during dumping", e)
Example #2
0
def project_state_update_via_queue():
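    """Poll the 'project-completion-queue' and mark a project document as
    'Completed' once every query in its 'associated_queries' has
    state == 'Completed'; the queue message is deleted only after the
    update."""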
    print("Project state updating queue is live")
    proj_collection = refer_projects_col()
    query_collection = refer_query_col()

    connect_str = os.getenv('AZURE_STORAGE_CONNECTION_STRING')
    project_comp_client = QueueClient.from_connection_string(
        connect_str, "project-completion-queue")

    while True:
        # print('*')
        time.sleep(600)
        rows = project_comp_client.receive_messages()
        for msg in rows:
            time.sleep(10)
            row = msg.content
            row = ast.literal_eval(row)
            print(row[0])
            entry_id = ObjectId(row[0])
            project_data_entry = proj_collection.find({"_id": entry_id})
            data = [i for i in project_data_entry]
            #check_for_the_completion_of_components
            try:
                associated_queries = data[0]['associated_queries']
                completed_count = 0
                for each_query_res in associated_queries:
                    que_entry = query_collection.find({"_id": each_query_res})
                    data_res = [i for i in que_entry]
                    if (data_res[0]['state'] == 'Completed'):
                        completed_count += 1
                print(['comp', completed_count, data[0]['query_count']])
                if (completed_count == data[0]['query_count']):
                    print("All the queries are completed for the project",
                          completed_count)
                    proj_collection.update_one(
                        {'_id': entry_id}, {'$set': {
                            'state': 'Completed'
                        }})
                    project_comp_client.delete_message(msg)

            except KeyError as e:
                print('Project is not yet ready', e)
            except IndexError as e:
                print('Project entry not yet available')
            except Exception as e:
                print("Exception occurred during dumping", e)
def get_entries_project(project_id):
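    """Return the ObjectIds of the entries attached to the first query of the
    given project that carries an 'associated_entries' field. Returns an
    empty list when the project has no queries (and falls through to None
    when none of its queries have entries yet)."""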
    projects_col = refer_projects_col()
    query_collection = refer_query_col()
    proj_data_entry = projects_col.find({"_id": project_id})
    proj_data = [i for i in proj_data_entry]
    proj_attribute_keys = list(proj_data[0].keys())
    if ('associated_queries' in proj_attribute_keys):
        associated_queries = proj_data[0]['associated_queries']
        for each_query in associated_queries:
            query_data_entry = query_collection.find({"_id": ObjectId(each_query)})
            query_data = [i for i in query_data_entry]
            query_attribute_keys = list(query_data[0].keys())
            if ('associated_entries' in query_attribute_keys):
                associated_entries = query_data[0]['associated_entries']
                obs_ids = [ObjectId(i) for i in associated_entries]
                return obs_ids


    else:
        print("This project do not have any queries yet")
        return []
Example #4
0
def fix_entry_counts():
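    """One-shot diagnostic pass over the 'query-queue': for each message,
    look up the referenced query document and print how many associated
    entries it currently has."""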
    print("Query state updating queue is live")
    query_collection = refer_query_col()
    mycol = refer_collection()

    connect_str = os.getenv('AZURE_STORAGE_CONNECTION_STRING')
    query_client = QueueClient.from_connection_string(connect_str,
                                                      "query-queue")
    rows = query_client.receive_messages()
    for msg in rows:
        # time.sleep(10)

        row = msg.content
        print(row)
        row = ast.literal_eval(row)
        print('getting_id', row[0])
        entry_id = ObjectId(row[0])
        query_data_entry = query_collection.find({"_id": entry_id})
        data = [i for i in query_data_entry]
        # check_for_the_completion_of_components

        associated_entries = data[0]['associated_entries']
        print('getting associated entries')
        print(len(associated_entries))
def get_entries_project(project_id):
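    """Walk every query of the given project and classify its profile entries
    by 'simplified_dump_state' into completed, incomplete and problematic
    sets. Prints a summary and returns the incomplete and problematic
    ObjectIds, or an empty list if the project has no queries."""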
    completed_count = []
    incomplete_count = 0
    incompletes = []
    problems = []
    all_entires = []
    profile_col = refer_collection()
    projects_col = refer_projects_col()
    query_collection = refer_query_col()
    proj_data_entry = projects_col.find({"_id": project_id})
    print('proj', proj_data_entry)
    proj_data = [i for i in proj_data_entry]
    print('data', len(proj_data))
    proj_attribute_keys = list(proj_data[-1].keys())
    if ('associated_queries' in proj_attribute_keys):
        associated_queries = proj_data[-1]['associated_queries']
        for each_query in associated_queries:
            query_data_entry = query_collection.find(
                {"_id": ObjectId(each_query)})
            query_data = [i for i in query_data_entry]
            print([
                query_data[0]['search_query'], query_data[0]['state'],
                query_data[0]['_id']
            ])
            query_attribute_keys = list(query_data[0].keys())
            if ('associated_entries' in query_attribute_keys):
                associated_entries = query_data[0]['associated_entries']
                # print('kk',associated_entries)
                obs_ids = [ObjectId(i) for i in associated_entries]
                all_entires.extend(obs_ids)

                for k in obs_ids:
                    prof_data_entry = profile_col.find({"_id": k})
                    # print('proj', proj_data_entry)
                    prof_data = [i for i in prof_data_entry]
                    prof_attribute_keys = list(prof_data[0].keys())

                    if ('simplified_dump_state' in prof_attribute_keys):
                        if (prof_data[0]['simplified_dump_state'] ==
                                'Completed'):
                            completed_count.append(k)
                        # else:print(prof_data[0]['simplified_dump_state'])
                        elif (prof_data[0]['simplified_dump_state'] ==
                              'Incomplete'):
                            incomplete_count += 1
                            incompletes.append(k)
                        else:
                            problems.append(k)
                    else:
                        problems.append(k)
                #
                # print(['completed',completed_count,'all',len(obs_ids),'incompleted',incomplete_count,incompletes,'prob',problems])
                # # filt = []
                # # for k in obs_ids:
                # #     if(k not in problems):
                # #         filt.append(k)
                # # print('filt',filt)
                # if(completed_count==len(obs_ids)):
                #     query_collection.update_one({'_id': ObjectId(each_query)}, {'$set': {'state': 'Completed'}})

                # return obs_ids

        print('completed_count', len(list(set(completed_count))))
        print('incomplete_count', incomplete_count)
        print('incompletes', list(set(incompletes)))
        print('problems', list(set(problems)))
        print('all', all_entires)
        return {
            'incompletes': list(set(incompletes)),
            'problems': list(set(problems))
        }
        # all_entires = list(set(all_entires))
        # return all_entires
    else:
        print("This project do not have any queries yet")
        return []
Example #6
0
def search_a_company(comp_name, db_collection, query_entry):
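    """Google-search '<comp_name> Australia', drop blacklisted and unwanted
    domains, and store the top remaining result in db_collection. Returns the
    inserted ObjectId, 'exist' when the profile is already in the database,
    None when no usable result is found, or 'error' on an exception."""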
    try:
        sr = getGoogleLinksForSearchText(comp_name + " Australia", 10,
                                         'initial')
        count = 0
        while (sr == 'captcha'):
            count = count + 1
            print('captcha detected and sleeping, attempt:', count)
            time.sleep(1200 * count)
            sr = getGoogleLinksForSearchText(comp_name, 10, 'initial')

        b_list_file = open(
            three_up + '//Simplified_System//Initial_Crawling//black_list.txt',
            'r')
        black_list = b_list_file.read().splitlines()
        # 'www.dnb.com'
        received_links = [i['link'] for i in sr]
        print(received_links)
        #filter the links
        filtered_sr_a = []
        filtered_received_links = []
        for i, each_l in enumerate(received_links):
            if (('.com/' in each_l) or ('.education/' in each_l)
                    or ('.io/' in each_l) or ('.com.au/' in each_l)
                    or ('.net/' in each_l) or ('.co.nz/' in each_l)
                    or ('.nz/' in each_l) or ('.au/' in each_l)
                    or ('.biz/' in each_l)):
                # print(each)
                filtered_received_links.append(each_l)
                filtered_sr_a.append(sr[i])

        print(filtered_sr_a)
        received_domains = [i.split("/")[2] for i in filtered_received_links]
        filtered_sr = []

        print('rd', received_domains)
        for i, each in enumerate(received_domains):
            # print(each)
            if (('.gov.' in each) or ('org' in each) or ('.govt.' in each)
                    or ('.edu.' in each)
                    or ('.uk' in each)):  # filter non wanted websites
                continue
            if each not in black_list:
                filtered_sr.append(filtered_sr_a[i])

        if (len(filtered_sr)):
            #is the link already taken
            res_data = is_profile_exist(filtered_sr[0]['link'])

            if (len(res_data)):
                print("Profile " + filtered_sr[0]['link'] +
                      " already existing at " + str(res_data[0]['_id']))
                query_collection = refer_query_col()
                query_collection.update_one(
                    {'_id': query_entry},
                    {'$set': {
                        'associated_entries': [res_data[0]['_id']]
                    }})
                return 'exist'
            #should fix comp name
            # print('fixing comp name')
            c_n_link = filtered_sr[0]['link']
            c_n_dom = c_n_link.split("/")[2]
            try:
                c_name = c_n_dom.split("www.")[1]
            except IndexError:
                c_name = c_n_dom
            if ('.com' in c_name):
                cc_name = c_name.split(".com")[0]
            elif ('.org' in c_name):
                cc_name = c_name.split(".org")[0]
            elif ('.io' in c_name):
                cc_name = c_name.split(".io")[0]
            elif ('.net' in c_name):
                cc_name = c_name.split(".net")[0]
            else:
                cc_name = c_name
            # print(filtered_sr[0]['link'])
            filtered_sr[0]['comp_name'] = cc_name
            filtered_sr[0]['query_id'] = query_entry
            filtered_sr[0]['ignore_flag'] = '0'
            rich_description = filtered_sr[0]['rich_description']
            if (rich_description != 'None' and rich_description != ''):
                filtered_sr[0]['description'] = rich_description
            record_entry = db_collection.insert_one(filtered_sr[0])
            print(filtered_sr[0])
            print("search record stored in db: ", record_entry.inserted_id)
            return record_entry.inserted_id
        else:
            print("No results found!")
            return None

    except Exception as e:
        print("Error occured! try again", e)
        return 'error'
Example #7
0
def search_a_query(search_query, number_of_results, db_collection,
                   query_entry):
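    """Google-search the query text, filter the hits by TLD and a domain
    blacklist, then run an 'initial' search per unique domain to build one
    profile each. Existing profiles are linked to the query's
    'associated_entries' instead of being re-inserted. Updates the query's
    'entry_count' and returns the list of newly inserted ObjectIds, None when
    the search yields nothing, or 'error' on an exception."""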
    try:
        sr = getGoogleLinksForSearchText(search_query, number_of_results,
                                         'normal')
        print('came', len(sr))
        count = 0
        while (sr == 'captcha'):
            count = count + 1
            print('captcha detected and sleeping, attempt:', count)
            time.sleep(1200 * count)
            sr = getGoogleLinksForSearchText(search_query, number_of_results,
                                             'normal')

        if (len(sr)):
            # print(sr)
            # record_entry = db_collection.insert_many(sr)

            for each_sr in sr:
                print(each_sr)
            received_links = [i['link'] for i in sr]
            filtered_received_links = []
            for each_l in received_links:
                if (('.com/' in each_l) or ('.education/' in each_l)
                        or ('.io/' in each_l) or ('.com.au/' in each_l)
                        or ('.net/' in each_l) or ('.co.nz/' in each_l)
                        or ('.nz/' in each_l) or ('.au/' in each_l)
                        or ('.biz/' in each_l)):
                    # print(each)
                    filtered_received_links.append(each_l)

            query_collection = refer_query_col()
            received_domains = [
                i.split("/")[2] for i in filtered_received_links
            ]
            print("received_domains", received_domains)
            received_domains = list(set(received_domains))
            print("received_domains", received_domains)
            ids_list = []
            already_existing_count = 0
            for k in range(len(received_domains)):
                time.sleep(10)
                # received_links no longer aligns with the de-duplicated domain list
                print(received_domains[k])
                b_list_file = open(
                    three_up +
                    '//Simplified_System//Initial_Crawling//black_list.txt',
                    'r')
                black_list = b_list_file.read().splitlines()
                if (received_domains[k]
                        in black_list):  #filter non wanted websites
                    continue
                if (('.gov.' in received_domains[k])
                        or ('org' in received_domains[k])
                        or ('.govt.' in received_domains[k])
                        or ('.edu.' in received_domains[k]) or
                    ('.uk'
                     in received_domains[k])):  # filter non wanted websites
                    continue
                sr = getGoogleLinksForSearchText(received_domains[k], 3,
                                                 'initial')
                if (len(sr) == 0):
                    sr = getGoogleLinksForSearchText(received_domains[k], 3,
                                                     'initial')
                    if (len(sr) == 0):
                        sr = getGoogleLinksForSearchText(
                            received_domains[k], 3, 'initial')
                if (len(sr) > 0):
                    print(sr[0])
                    res_data = is_profile_exist(sr[0]['link'])
                    if (len(res_data)):
                        print("Profile " + sr[0]['link'] +
                              " already existing at " +
                              str(res_data[0]['_id']))
                        already_existing_count += 1
                        #updating associates

                        qq_data_entry = query_collection.find(
                            {"_id": query_entry})
                        qq_data = [i for i in qq_data_entry]
                        qq_attribute_keys = list(qq_data[0].keys())
                        if ('associated_entries' in qq_attribute_keys):
                            print('in main')
                            query_collection.update_one({'_id': query_entry}, {
                                '$set': {
                                    'associated_entries':
                                    qq_data[0]['associated_entries'] +
                                    [res_data[0]['_id']]
                                }
                            })
                        else:
                            query_collection.update_one({'_id': query_entry}, {
                                '$set': {
                                    'associated_entries': [res_data[0]['_id']]
                                }
                            })
                        continue
                    sr[0]['search_text'] = search_query
                    try:
                        c_name = received_domains[k].split("www.")[1]
                    except IndexError:
                        c_name = received_domains[k]
                    if ('.com' in c_name):
                        sr[0]['comp_name'] = c_name.split(".com")[0]
                    elif ('.org' in c_name):
                        sr[0]['comp_name'] = c_name.split(".org")[0]
                    elif ('.co' in c_name):
                        sr[0]['comp_name'] = c_name.split(".co")[0]
                    elif ('.edu' in c_name):
                        sr[0]['comp_name'] = c_name.split(".edu")[0]
                    else:
                        sr[0]['comp_name'] = c_name
                    print(sr[0])
                    sr[0]['query_id'] = query_entry
                    sr[0]['ignore_flag'] = '0'
                    rich_description = sr[0]['rich_description']
                    if (rich_description != 'None' and rich_description != ''):
                        sr[0]['description'] = rich_description

                    record_entry = db_collection.insert_one(sr[0])
                    print("search record stored in db: ",
                          record_entry.inserted_id)
                    ids_list.append(record_entry.inserted_id)
                else:
                    print("Cannot find results, skipping company")
            print('from initial crawling', ids_list)

            entry_count = already_existing_count + len(ids_list)
            print("Total entry count", entry_count)
            query_collection.update_one({'_id': query_entry},
                                        {'$set': {
                                            'entry_count': entry_count
                                        }})
            return ids_list
            # print("search records stored in db: ", record_entry.inserted_ids)
        else:
            print("No results found!")
            return None
    except Exception as e:
        print("Error occured! try again", e)
        return 'error'

    #store file to a csv file
    # with open('search_results.csv', mode='w',encoding='utf8') as results_file:  # store search results in to a csv file
    #     results_writer = csv.writer(results_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    #
    #     for each_item in sr:
    #         results_writer.writerow([each_item['title'], each_item['link'], each_item['description']])
    #     results_file.close()


# mycol = refer_collection()
# search_a_company('TOOHEYS PTY LIMITED',mycol)
# search_a_company('CALTEX PETROLEUM PTY LTD',mycol)
# search_a_query('Digital advertisement and marketing analytics services company',5,mycol)
Example #8
0
def execute_for_a_company(comp_name):
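    """Run the full profiling pipeline for a single company: initial crawl,
    deep crawl, feature and contact-person extraction, class prediction,
    Crunchbase/LinkedIn/OpenCorporates/D&B lookups, Google-based enrichment
    and the final export. Start, completion and elapsed time are recorded on
    the query document."""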
    mycol = refer_collection()
    print("Searching a company")
    dateTimeObj = datetime.now()
    query_collection = refer_query_col()
    data_q = {'started_time_stamp': dateTimeObj, 'search_query': comp_name}
    record_entry = query_collection.insert_one(data_q)
    print("Started on", dateTimeObj)
    started = time.time()
    print("***Initial Crawling Phrase***")
    entry_id = search_a_company(comp_name, mycol, record_entry.inserted_id)
    if (entry_id is None):
        for i in range(3):
            print("Initial crawling incomplete, retrying", i)
            entry_id = search_a_company(comp_name, mycol,
                                        record_entry.inserted_id)
            time.sleep(5)
            if (entry_id is not None): break
    if (entry_id is None):
        print("Initial crawling incomplete, retrying unsuccessful")
    elif (entry_id == 'exist'):
        print("Existing profile found. Pipeline exits")
    else:
        print("entry id received ", entry_id)
        print("***Deep Crawling Phrase***")
        deep_crawl([entry_id], 3, 100)
        print(
            "Deep crawling completed and record extended with crawled_links,header_text,paragraph_text,social_media_links,telephone numbers,emails,addresses"
        )
        print("***Feature Extraction Phrase***")
        extract_features([entry_id])

        print("***Contact Person Extraction Phrase***")
        extract_contact_persons([entry_id], 'comp')

        print(("***Predict the company type***"))
        predict_class_tags([entry_id])

        print(("***Extract crunchbase profile data***"))
        get_cb_data([entry_id])

        print(("***Extract linkedin profile data***"))
        get_li_data([entry_id])

        print(("***Extract opencorporates profile data***"))
        get_oc_data([entry_id])

        print(("***Extract dnb profile data***"))
        get_dnb_data([entry_id])

        print("***Addresses from google***")
        get_ad_from_google([entry_id])
        print("***Addresses extraction completed***")

        print("***cp from google***")
        get_cp_from_google([entry_id])
        print("***cp extraction completed***")

        print("***phone numbers from google***")
        get_tp_from_google([entry_id])
        print("***tp extraction completed***")

        print("***frequent questions google***")
        get_qa_from_google([entry_id])
        print("***qa extraction completed***")

        print(("***Dumping the results***"))
        # export_profiles([entry_id],record_entry.inserted_id)
        simplified_export([entry_id])
        ended = time.time()
        duration = ended - started
        dateTimeObj_e = datetime.now()
        completion_data = {
            'completed_time_stamp': dateTimeObj_e,
            'elapsed_time': duration
        }
        print(completion_data)
        query_collection.update_one({'_id': record_entry.inserted_id},
                                    {'$set': completion_data})
        print("Pipeline execution completed, elapsed time:", duration)
Example #9
0
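        # Fragment of the queue worker reading from ic_client: each message
        # carries "<search text>--<mode>"; in 'query' mode a query document is
        # created and search_a_query() is run with up to three retries.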
        rows = ic_client.receive_messages()
        for msg in rows:
            # time.sleep(120)
            row = msg.content
            row = ast.literal_eval(row)
            print(row[0])

            input_d = row[0].split("--")
            try:
                mode = input_d[1]
                s_text = input_d[0]
                if mode == 'query':
                    query = s_text.strip()
                    print("Searching a query")
                    dateTimeObj = datetime.now()
                    query_collection = refer_query_col()
                    data_q = {'started_time_stamp': dateTimeObj, 'search_query': query}
                    record_entry = query_collection.insert_one(data_q)
                    print("Started on", dateTimeObj)
                    started = time.time()
                    print("***Initial Crawling Phrase***")
                    entry_id_list = search_a_query(query, 10, mycol, record_entry.inserted_id)
                    if (entry_id_list is None):
                        for i in range(3):
                            print("Initial crawling incomplete, retrying", i)
                            entry_id_list = search_a_query(query, 10, mycol, record_entry.inserted_id)
                            time.sleep(5)
                            if (entry_id_list is not None): break
                    if (entry_id_list is None):
                        print("Initial crawling incomplete, retrying unsuccessful")
                    elif (entry_id_list == 'error'):