def merge_int_edges(edge_count_max):
    """Write merged integer edges to a temp file: edges that repeat keep
    their old id with the new weight; edges only in the new file get
    fresh ids past edge_count_max."""
    edge_int_dict = {}
    edge_file_temp = open(dcrconfig.ConfigManager().IntegerEdegesFile.replace('int_edges', 'int_edges_temp'), 'w')
    edge_file_new = open(dcrconfig.ConfigManager().IntegerEdegesFile.replace('int_edges', 'int_edges_new'), 'r')
    for each_line_new in edge_file_new:
        # Each line: '<edge_id> <node1> <node2> <weight>'
        (edge_count_new, edge1_new,
         edge2_new, edge_weight_new) = map(int, each_line_new.split()[:4])
        edge_file = open(dcrconfig.ConfigManager().IntegerEdegesFile, 'r')
        repeat_flag = 0
        for each_line in edge_file:
            (edge_count, edge1,
             edge2, edge_weight) = map(int, each_line.split()[:4])
            if edge1_new == edge1 and edge2_new == edge2:
                repeat_flag = 1
                print('%d %d %d %d' % (edge_count, edge1, edge2, edge_weight_new), file=edge_file_temp)
                edge_int_dict[(edge1, edge2)] = edge_count
        edge_file.close()
        if repeat_flag == 0:
            edge_count_max += 1
            print('%d %d %d %d' % (edge_count_max, edge1_new, edge2_new, edge_weight_new), file=edge_file_temp)
            edge_int_dict[(edge1_new, edge2_new)] = edge_count_max
    edge_file_new.close()
    edge_file_temp.close()
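
# A minimal, self-contained sketch (hypothetical values, not from the source)
# of the '<edge_id> <node1> <node2> <weight>' line format that these edge-file
# helpers read and write.
line = '7 12 34 5'
edge_id, node1, node2, weight = map(int, line.split())
print((node1, node2), '->', edge_id, 'weight', weight)  # (12, 34) -> 7 weight 5
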
def nounphrase_generate():
    c = MongoClient(dcrconfig.ConfigManager().Datadb)
    db = c[config.ConfigManager().IntelligenceDb]
    col = db[config.ConfigManager().IntelligenceDataCollection]
    docs = col.find({'nounPhrases': ""}, {
        "description": 1,
        "doc_id": 1,
        "_id": 1
    })

    mongoport = int(config.ConfigManager().MongoDBPort)
    connection = dbmanager.mongoDB_connection(mongoport)

    for doc in docs:
        try:
            data = {}
            data['desc'] = doc['description']
            data['_id'] = doc['_id']
            data['doc_id'] = doc['doc_id']
            data['connection'] = connection
            q.put(data)

        except BaseException as ex:
            exception_message = '\n' + 'Exception:' + '\n'
            exception_message += str(datetime.datetime.now()) + '\n'
            exception_message += 'File: ' + '\n'
            exception_message += '\n' + str(ex) + '\n'
            exception_message += '-' * 100
            utility.write_to_file(
                dcrconfig.ConfigManager().SemanticGraphLogFile, 'a',
                exception_message)
def generate_document_graphs(dict, edge_dict):
    #  Loop through the distinct phrase file and generate the integer graph
    phrase_file = open(dcrconfig.ConfigManager().DistinctPhraseFile, 'r')
    jdcount = 0
    graph_weight = dcrconfig.ConfigManager().GraphEdgeWeight

    for line in phrase_file:
        line = line.strip()

        if (line.startswith('--')):
            #  If the line starts with -- then it is the beginning of a job
            #  description, so print a dot to indicate progress
            print('.', end='')
            sys.stdout.flush()
            doc = line.strip()

        if not (line.startswith('--') or len(line.strip()) < 1):
            graph = dcrgraph.create_graph_distant_neighbors(line, graph_weight)
            graph = dcrgraph.generate_document_integer_graph(dict,
                                                             graph,
                                                             doc,
                                                             edge_dict)
            jdcount += 1
            if jdcount % 10 == 0:
                print('%d' % jdcount)
def create_document_graph_distant_neighbors(phrase_string,
                                            neighborCount,
                                            diminition_percent,
                                            edge_weight=1):
    phrase_sentences = phrase_string.split('.')
    base_graph = create_graph(phrase_string.replace('.', ''))
    neighbor_sensitive_graph = nx.Graph()

    for sent in phrase_sentences:
        ph = sent.split('|')
        phrases = [s for s in ph if len(s) > 2]
        neighbor_sensitive_graph.add_nodes_from(phrases)

        neighborPhrasesList = list(product(enumerate(phrases), repeat=2))

        # NOTE: the config value overrides the edge_weight parameter
        edge_weight = dcrconfig.ConfigManager().GraphEdgeWeight

        for neighbor in neighborPhrasesList:
            if (neighbor[0])[0] < (neighbor[1])[0]:
                neighborDistance = (neighbor[1])[0] - (neighbor[0])[0]
                if neighborDistance <= neighborCount:
                    if neighborDistance == 1:
                        edge_weight = dcrconfig.ConfigManager().GraphEdgeWeight
                    else:
                        edge_weight = math.floor(
                            dcrconfig.ConfigManager().GraphEdgeWeight *
                            (diminition_percent / 100)**(neighborDistance - 1))

                    neighbor_sensitive_graph.add_edge((neighbor[0])[1],
                                                      (neighbor[1])[1],
                                                      weight=edge_weight)

    union_graph(base_graph, neighbor_sensitive_graph)
    return base_graph
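
# A worked example (assumed, not actual, config values) of the distance-decayed
# edge weight used above:
#   weight = floor(GraphEdgeWeight * (diminition_percent / 100) ** (distance - 1))
import math

graph_edge_weight = 16    # hypothetical GraphEdgeWeight
diminition_percent = 50   # hypothetical diminution percentage
for distance in range(1, 5):
    weight = math.floor(
        graph_edge_weight * (diminition_percent / 100) ** (distance - 1))
    print(distance, weight)  # 1 16 / 2 8 / 3 4 / 4 2
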
def st_create_graph_distant_neighbors(phrase_string, edge_weight=1):
    phrase_sentences = phrase_string.split('.')
    base_graph = create_graph(phrase_string.replace('.', ''))
    neighbor_sensitive_graph = nx.Graph()
    diminition_percent = dcrconfig.ConfigManager().STDiminitionPercentage

    for sent in phrase_sentences:
        ph = sent.split('|')
        phrases = [s for s in ph if len(s) > 2]
        neighbor_sensitive_graph.add_nodes_from(phrases)

        # Add all edges
        phrase_len = len(phrases)
        for i in range(phrase_len - 1):
            edge_weight = dcrconfig.ConfigManager().STGraphEdgeWeight
            for j in range(i + 1, phrase_len):
                neighbor_sensitive_graph.add_edge(phrases[i],
                                                  phrases[j],
                                                  weight=edge_weight)

                #   Reduce the graph weight by the predefined percentage
                edge_weight = math.floor(edge_weight * diminition_percent /
                                         100)
                #   If the edge_weight diminishes to less than 1,
                #   then you don't need to proceed.
                if edge_weight < 1:
                    break

    union_graph(base_graph, neighbor_sensitive_graph)
    return base_graph
def neighbor_count_for_edge_weight():
    neighborCount = 0
    edge_weight = dcrconfig.ConfigManager().GraphEdgeWeight
    diminition_percent = dcrconfig.ConfigManager().DiminitionPercentage
    while True:
        neighborCount += 1
        edge_weight = math.floor(edge_weight * diminition_percent / 100)
        if edge_weight < 1:
            break
    return neighborCount
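
# A worked example (assumed config values) of the cutoff computed above:
# starting from 16 and halving with floor each step gives
# 16 -> 8 -> 4 -> 2 -> 1 -> 0, so neighbor_count_for_edge_weight() returns 5.
import math

edge_weight = 16          # hypothetical GraphEdgeWeight
diminition_percent = 50   # hypothetical DiminitionPercentage
neighbor_count = 0
while True:
    neighbor_count += 1
    edge_weight = math.floor(edge_weight * diminition_percent / 100)
    if edge_weight < 1:
        break
print(neighbor_count)  # 5
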
def save_node_dict():
    import pickle
    print('Reading Semantic Graph...')
    graph = nx.read_gexf(dcrconfig.ConfigManager().SemanticGraphFile)

    # create an integer mapping for each of the phrases
    # mapping_dict = dcrgraph.create_node_dictionary(graph.nodes())
    old_node_dict = load_node_dict()
    mapping_dict = dcrgraph.append_node_dictionary(graph.nodes(), old_node_dict)
    with open(dcrconfig.ConfigManager().NodeFile, 'wb') as node_file:
        pickle.dump(mapping_dict, node_file)
    print('Saving nodes completed')
def automate_processes():
    utility.write_to_file(
        dcrconfig.ConfigManager().SemanticGraphLogFile, 'a',
        'Knowledge build automation running..! ' +
        str(datetime.datetime.now()))
    try:
        # Copies files from the previous cycle
        exec(open('filecopy.py').read(), globals())
        # Copy the noun phrase text from Mongo DB
        exec(open('dbtophrasefile.py').read(), globals())
        # Remove any n-gram of 3 or more words.
        exec(open('ngramremoval.py').read(), globals())
        # Remove duplicates and save it in new distinct phrase file.
        exec(open('duplicatefinder.py').read(), globals())
        # Check if there is an existing semantic graph; if so, load and
        # update it with the new documents, else create and store a new
        # semantic graph. Normally, this runs after n-gram removal and
        # duplicate removal.
        exec(open('dcrgraphgenerator.py').read(), globals())
        # Read the semantic graph which is saved using dcrgraphgenerator.py
        # and read the document phrase file and create optimized integer
        # semantic edge file.
        exec(open('dcrgraphcompactor.py').read(), globals())
        # Save the node dictionary to file using pickle. This will be used
        # by the above programs for finding node ids
        exec(open('savenodes.py').read(), globals())
        # Generate document integer graph and store. This will be used for
        # searching the documents.
        # exec(open('dcrdocumentintgraphgenerator.py').read(), globals())
        # Copy the noun phrase text from Mongo DB (Intelligence collection)
        exec(open('stdbtophrasefile.py').read(), globals())
        # Remove any n-gram of 3 or more words.
        exec(open('ngramremoval.py').read(), globals())
        # Remove duplicates and save it in new distinct phrase file.
        exec(open('duplicatefinder.py').read(), globals())
        # Check if there is an existing semantic graph; if so, load and
        # update it with the new documents, else create and store a new
        # semantic graph. Normally, this runs after n-gram removal and
        # duplicate removal.
        exec(open('stdcrgraphgenerator.py').read(), globals())
        # Read the semantic graph which is saved using dcrgraphgenerator.py
        # and read the document phrase file and create optimized integer
        # semantic edge file.
        exec(open('stdcrgraphcompactor.py').read(), globals())
        # Save the node dictionary to file using pickle. This will be used
        # by the above programs for finding node ids
        exec(open('savenodes.py').read(), globals())
        # Transfer generated intelligence files
        exec(open('filetransfer.py').read(), globals())
    except BaseException as ex:
        utility.log_exception_file(
            ex,
            dcrconfig.ConfigManager().SemanticGraphLogFile)
def generate_nodes():
    semantic_edge_file = open(dcrconfig.ConfigManager().IntegerEdegesFile, 'r')
    node_file = open(dcrconfig.ConfigManager().IntegerNodesFile, 'w')

    # Each edge line is '<edge_id> <node1> <node2> <weight>', so the node
    # ids are the second and third fields, not the first two.
    nodes = set()
    for line in semantic_edge_file:
        words = line.split()
        if len(words) < 4:
            continue
        nodes.add(words[1])
        nodes.add(words[2])
    semantic_edge_file.close()

    print('Saving integer nodes to file ...')
    for node in nodes:
        print("%s" % node, file=node_file)
    node_file.close()
def generate_document_graph_images():
    #  Loop through the document integer edge file and generate graph images
    phrase_file = open(dcrconfig.ConfigManager().DocumentsEdgesIntegerFile,
                       'r')
    jdcount = 0
    doc = ''

    node_collection = []
    for line in phrase_file:
        line = line.strip()

        if (line.startswith('--')):
            #  If the line starts with -- then it is the beginning of a job
            #  description, so print a dot to indicate progress
            print('.', end='')
            sys.stdout.flush()
            sys.stdout.flush()
            if node_collection:
                generate_graph_image(node_collection, doc)
                node_collection = []
                jdcount += 1

            doc = line.strip()
        if not (line.startswith('--') or len(line.strip()) < 1):
            node_collection.append(line.split(' '))

        if jdcount > 50:
            break
        elif jdcount % 20 == 0:
            plt.close('all')
def generate_req_candidate_file_selected_req(req_list):

    candidate_list = list(
        candidates.find({}, {
            "candidateid": 1,
            "requirementIDList": 1
        }))

    # Remove duplicates in case the list comes from another source
    distinct_req_list = list(set(req_list))

    for req in distinct_req_list:
        # File name is the requirement Id with the path from config
        req_file_name = dcrconfig.ConfigManager().SmartTrackDirectory
        req_file_name += str(req)

        # Clear the file. This can be changed to append if only the new
        # candidates are picked in the candidate list
        open(req_file_name, 'w').close()

        # Find candidates for the requirement and generate the
        # req candidate file.
        req_candidate_list = find_candidates(candidate_list, req)
        generate_req_candidate_file_edge_dict_from_file(
            req_file_name, req_candidate_list)
def generate_document_integer_graph(integer_dict,
                                    document_graph,
                                    document,
                                    edge_dict,
                                    file_operation='a',
                                    edge_file_path=''):
    #   Generate a compacted/integer subgraph based on the integer dictionary.
    #   If file_operation == 'a', append to the configured document graph file.

    #   Step 1: Remove the nodes that are not present in the dictionary
    document_graph = remove_missing_nodes(integer_dict, document_graph)

    #   Step 2: Integerize the nodes
    graph = relabel_nodes(document_graph, integer_dict)

    #   Step 3: Save to the edge file
    #   Fall back to the configured path if no file path is given
    if edge_file_path == '':
        edge_file_path = dcrconfig.ConfigManager().DocumentsEdgesIntegerFile

    edge_file = open(edge_file_path, file_operation)
    print(document, file=edge_file)
    for edge in graph.edges(data=True):
        edge1 = edge[0]
        edge2 = edge[1]
        if edge1 > edge2:
            edge1, edge2 = edge2, edge1
        key = edge1, edge2
        if key in edge_dict:
            print('%d %d %d %d' %
                  (edge_dict[key], edge1, edge2, edge[2]['weight']),
                  file=edge_file)
    edge_file.close()
    return graph
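
# A tiny sketch of the canonical edge-key ordering used above and in the
# dictionary builders below: the smaller integer node id always comes first,
# so (34, 12) and (12, 34) resolve to the same key.
edge1, edge2 = 34, 12
if edge1 > edge2:
    edge1, edge2 = edge2, edge1
print((edge1, edge2))  # (12, 34)
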
def generate_document_signature_graph(dict, edge_dict, noun_phrases, neighborCount, diminition_percent):

    graph_weight = dcrconfig.ConfigManager().GraphEdgeWeight
    # noun_phrases = "|human|asst|level|performs variety|general personnel clerical tasks|areas|employee|education training|employment|compensation|equal employment opportunity.|personnel|compiles sensitive|confidential personnel.|accordance|information.|provides information|personnel.|mba hr."
    # graph = dcrgraph.create_document_graph_distant_neighbors(noun_phrases, neighborCount, diminition_percent, graph_weight)
    graph = dcrgraph.create_graph_distant_neighbors_with_generator(noun_phrases, graph_weight)
    graph = dcrgraph.graph_to_signature_graph(dict, graph, edge_dict)
    return graph
def job_info_analysis(page, filepath, dbrecordcount):
    global totalrecords
    global invalidrecords
    global emptydesc
    global incompletedesc
    global smalldesc
    global nonedesc
    global nodesc
    global totaljobsdict
    global jobsitedict

    dict_object_record_list = []
    for jobinfo in page.findall('record'):
        try:
            # creating dictionary from xml tag contents
            dict_object = utility.xml_to_dict(ET.tostring(jobinfo))
            # totaljobsdict = fill_job_by_site(filepath)
            # totalrecords += 1

            # outer if: check that the jobdescription tag is in the xml
            if 'jobdescription' in (dict_object['record']):
                # inner if: check that the job description is not None
                if ((dict_object['record'])['jobdescription'] is not None):

                    incorrectjobdescription = 0

                    if (((dict_object['record'])['jobdescription']).strip()
                        ) == '':
                        incorrectjobdescription = 1

                    if (len(((dict_object['record'])['jobdescription'])) < 20):
                        incorrectjobdescription = 1

                    if (((dict_object['record'])['jobdescription']
                         ).strip()[-3:]) == '...':
                        incorrectjobdescription = 1

                    if (incorrectjobdescription == 0):
                        (dict_object['record']
                         )['dateCreated'] = datetime.datetime.now()
                        (dict_object['record']
                         )['dateModified'] = datetime.datetime.now()
                        (dict_object['record'])['createdUser'] = '******'
                        (dict_object['record'])['modifiedUser'] = '******'
                        (dict_object['record'])['source'] = 'PromptCloud'
                        #(dict_object['record'])['Url'] = page['pageurl']
                        dict_object_record_list.append(dict_object['record'])
                        dbrecordcount += 1

        except BaseException as ex:
            utility.log_exception_file(
                ex,
                dcrconfig.ConfigManager().SemanticGraphLogFile)
    if dict_object_record_list:
        insert_to_db(dict_object_record_list)
    # updating doc_id in config table

    return dbrecordcount
def create_edges_file_with_dict():
    print('Reading Semantic Graph...')
    graph = nx.read_gexf(dcrconfig.ConfigManager().SemanticGraphFile)

    edge_file = open(dcrconfig.ConfigManager().IntegerEdegesFile, 'r')
    edge_int_dict_old = {}
    last_line = '0 '
    for each_line in edge_file:
        last_line = each_line
        (edge_count, edge1,
         edge2, edge_weight) = map(int, each_line.split()[:4])
        edge_int_dict_old[(edge1, edge2)] = [edge_count, edge_weight]

    edge_count_old_max = int((last_line.split(' '))[0])
    edge_file.close()
    # create an integer mapping for each of the phrases
    # mapping_dict = dcrgraph.create_node_dictionary(graph.nodes())
    old_node_dict = load_node_dict()
    mapping_dict = dcrgraph.append_node_dictionary(graph.nodes(), old_node_dict)
    # writer = csv.writer(open('/mnt/nlpdata/nodedict.csv', 'w'))
    # for key, value in mapping_dict.items():
    #     writer.writerow([key, value])

    new_graph = nx.relabel_nodes(graph, mapping_dict)

    edge_int_dict = {}
    edge_count = 0

    # Loop through the edges, ordering each pair so the smaller node id
    # comes first. This will help in compressing the graph
    for edge in new_graph.edges(data=True):
        edge1 = edge[0]
        edge2 = edge[1]
        if edge1 > edge2:
            edge1, edge2 = edge2, edge1
        edge_count += 1

        edge_int_dict[(edge1, edge2)] = [int(edge_count), int(edge[2]['weight'])]
    merge_int_edges_with_dict(edge_count_old_max, edge_int_dict_old, edge_int_dict)
    generate_nodes()
def get_normalized_dictionary_from_int_edges():
    edge_int_dict = {}
    edge_file = open(dcrconfig.ConfigManager().IntegerEdegesFile, 'r')
    for each_line in edge_file:
        edge_count, edge1, edge2 = map(int, each_line.split()[:3])
        edge_int_dict[(edge1, edge2)] = edge_count
    edge_file.close()

    return edge_int_dict
def update_graph():
    '''Load the existing graph and update it with the new set of job
    descriptions from predefined locations based on the application.ini file'''
    semantic_graph = load_graph()

    phrase_file = open(dcrconfig.ConfigManager().DistinctPhraseFile, 'r')
    '''Get the config values'''
    graph_weight = dcrconfig.ConfigManager().GraphEdgeWeight
    graph_filter_weight = dcrconfig.ConfigManager().FilterGraphEdgeWeight
    print("weight:%d filter weight: %d" % (graph_weight, graph_filter_weight))

    # graph_collection = []
    jdcount = 0

    for line in phrase_file:
        try:
            line = line.strip()

            if not (line.startswith('--') or len(line.strip()) < 1):
                graph = dcrgraph.create_graph_distant_neighbors(line, graph_weight)
                dcrgraph.union_graph(semantic_graph, graph, graph_weight)
                jdcount += 1
            elif (line.startswith('--')):
                '''If the line starts with -- then it is the beginning of a
                   job description, so print a dot to indicate progress'''
                print('.', end='')
                if jdcount % 1000 == 0:
                    print('%d' % jdcount)
                sys.stdout.flush()
        except BaseException as ex:
            utility.log_exception_file(ex, dcrconfig.ConfigManager().SemanticGraphLogFile)

    count = list((d['weight']) for u, v, d in
                 semantic_graph.edges_iter(data=True)
                 if d['weight'] > graph_filter_weight)

    ''' nx.write_gexf(semantic_graph,
                    dcrconfig.ConfigManager().SemanticGraphFile)'''
    mx = max(count)
    print('mx : %d, total jd processed : %d ' % (mx, jdcount))
    print('Semantic Graph Info: %s' % nx.info(semantic_graph))
    return semantic_graph
def job_info_analysis_storage(page_dict_object, filepath, dbrecordcount):
    global totalrecords
    global invalidrecords
    global emptydesc
    global incompletedesc
    global smalldesc
    global nonedesc
    global nodesc
    global totaljobsdict
    global jobsitedict

    dict_object_record_list = []
    try:
        dict_object = page_dict_object['page']
        # outer if: check that the jobdescription tag is in the xml
        if 'jobdescription' in (dict_object['record']):
            # inner if: check that the job description is not None
            if ((dict_object['record'])['jobdescription'] is not None):

                incorrectjobdescription = 0

                if (((dict_object['record'])['jobdescription']).strip()) == '':
                    incorrectjobdescription = 1

                if (len(((dict_object['record'])['jobdescription'])) < 20):
                    incorrectjobdescription = 1

                if (((dict_object['record'])['jobdescription']).strip()[-3:]
                    ) == '...':
                    incorrectjobdescription = 1

                if (incorrectjobdescription == 0):
                    (dict_object['record']
                     )['dateCreated'] = datetime.datetime.now()
                    (dict_object['record']
                     )['dateModified'] = datetime.datetime.now()
                    (dict_object['record'])['createdUser'] = '******'
                    (dict_object['record'])['modifiedUser'] = '******'
                    (dict_object['record'])['source'] = 'PromptCloud'
                    (dict_object['record'])['Url'] = dict_object['pageurl']
                    (dict_object['record'])['fileName'] = filepath.replace(
                        config.ConfigManager().PCFileFolder + '/', '')
                    dict_object_record_list.append(dict_object['record'])
                    dbrecordcount += 1

    except BaseException as ex:
        utility.log_exception_file(
            ex,
            dcrconfig.ConfigManager().SemanticGraphLogFile)
    if dict_object_record_list:
        insert_to_db(dict_object_record_list)
    # updating doc_id in config table

    return dbrecordcount
def load_graph():
    '''Load semantic graph if it is already present in the system'''
    semantic_graph = nx.Graph()
    semantic_graph_path = dcrconfig.ConfigManager().SemanticGraphFile
    if os.path.isfile(semantic_graph_path):
        print("File found")
        semantic_graph = nx.read_gexf(semantic_graph_path)
        print('Semantic Graph Info: %s' % nx.info(semantic_graph))
    else:
        print('No existing semantic graph found')

    return semantic_graph
def remove_ngram_from_allphrasefile():
    utility.write_to_file(
        dcrconfig.ConfigManager().SemanticGraphLogFile, 'a',
        'Semantic graph Generation Step 5..! (ngramremoval.py) ' +
        str(datetime.datetime.now()))
    # Loop thru all phrase files and generate the integer graph
    phrase_file = open(dcrconfig.ConfigManager().PhraseFile, 'r')
    ng_phrase_file = open(dcrconfig.ConfigManager().NGramFilteredPhraseFile,
                          'w')

    for line in phrase_file:
        line = line.strip()
        if (line.startswith('--')):
            #  If the line starts with -- then it is the beginning of a job
            #  description, so print a dot to indicate progress
            print('.', end='')
            sys.stdout.flush()
            print(line, file=ng_phrase_file)
        # If the line doesn't start with -- and is not just whitespace
        if not (line.startswith('--') or len(line.strip()) < 1):
            print(remove_ngram(line), file=ng_phrase_file)
def merge_int_edges_with_dict(edge_count_max, edge_int_dict_old, edge_int_dict):
    edge_file_new = open(dcrconfig.ConfigManager().IntegerEdegesFile, 'w')
    edge_int_dict_new = edge_int_dict_old.copy()
    for key in edge_int_dict:
        if key in edge_int_dict_old:
            edge_int_dict_new[key] = [(edge_int_dict_old[key])[0], (edge_int_dict[key])[1]]
        else:
            edge_count_max += 1
            edge_int_dict_new[key] = [edge_count_max, (edge_int_dict[key])[1]]
    edge_int_dict.clear()
    edge_int_dict_old.clear()
    for key in sorted(edge_int_dict_new.keys(), key=lambda k: edge_int_dict_new[k][0]):
        print('%d %d %d %d' % ((edge_int_dict_new[key])[0], int(key[0]), int(key[1]), (edge_int_dict_new[key])[1]), file=edge_file_new)

    edge_file_new.close()
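
# A minimal sketch (hypothetical values, not from the source) of the merge
# semantics implemented above: repeated edges keep their old id but take the
# new weight; unseen edges get fresh ids past the old maximum.
old = {(1, 2): [1, 10], (1, 3): [2, 7]}   # (node1, node2) -> [edge_id, weight]
new = {(1, 2): [1, 12], (2, 3): [2, 5]}
merged = old.copy()
next_id = max(entry[0] for entry in old.values())
for key, (edge_id, weight) in new.items():
    if key in old:
        merged[key] = [old[key][0], weight]   # keep the old id, update weight
    else:
        next_id += 1
        merged[key] = [next_id, weight]
print(merged)  # {(1, 2): [1, 12], (1, 3): [2, 7], (2, 3): [3, 5]}
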
def generate_document_graphs_from_dict_list_savetodb(dict, edge_dict,
                                                     noun_phrases):
    #  Generate the integer graph for a single noun-phrase string

    graph_weight = dcrconfig.ConfigManager().GraphEdgeWeight

    graph = dcrgraph.create_graph_distant_neighbors(noun_phrases,
                                                    graph_weight)
    graph = dcrgraph.generate_document_integer_graph_savetodb(dict,
                                                              graph,
                                                              edge_dict)
    return graph
def append_edges_file():
    print('Reading Semantic Graph...')
    graph = nx.read_gexf(dcrconfig.ConfigManager().SemanticGraphFile)

    # create an integer mapping for each of the phrases
    # mapping_dict = dcrgraph.create_node_dictionary(graph.nodes())
    old_node_dict = load_node_dict()
    mapping_dict = dcrgraph.delta_node_dictionary(graph.nodes(), old_node_dict)
    # writer = csv.writer(open('/mnt/nlpdata/nodedict.csv', 'w'))
    # for key, value in mapping_dict.items():
    #     writer.writerow([key, value])

    new_graph = nx.relabel_nodes(graph, mapping_dict)
    print('Saving integer Semantic graph...')
    edge_file = open(dcrconfig.ConfigManager().IntegerEdegesFile, 'r')
    print(dcrconfig.ConfigManager().IntegerEdegesFile)
    last_line = '0 '
    for each_line in edge_file:
        last_line = each_line
    edge_int_dict = {}
    edge_count = int((last_line.split(' '))[0])
    print(last_line)
    print(edge_count)
    edge_file.close()
    edge_file_append = open(dcrconfig.ConfigManager().IntegerEdegesFile, 'a')
    if new_graph.edges(data=True):
        # separate the appended edges from the existing file contents
        print('', file=edge_file_append)
    # Loop through the edges, ordering each pair so the smaller node id
    # comes first. This will help in compressing the graph
    for edge in new_graph.edges(data=True):
        if (isinstance(edge[0], int) and isinstance(edge[1], int)):
            edge1 = edge[0]
            edge2 = edge[1]
            if edge1 > edge2:
                edge1, edge2 = edge2, edge1
            edge_count += 1
            print(edge1, edge2)
            print(edge_count, edge1, edge2, edge[2]['weight'])
            print('%d %d %d %d' % (edge_count, edge1, edge2, edge[2]['weight']),
                  file=edge_file_append)
            edge_int_dict[(edge1, edge2)] = edge_count

    edge_file_append.close()
    temp_path = (dcrconfig.ConfigManager().IntegerEdegesFile
                 .replace('int_edges', 'int_edges_temp'))
    rewrite_strip_edges_file(dcrconfig.ConfigManager().IntegerEdegesFile,
                             temp_path)
    rewrite_strip_edges_file(temp_path,
                             dcrconfig.ConfigManager().IntegerEdegesFile)
    generate_nodes()
def generate_document_graphs_from_dict_list(dict, edge_dict, list, directory):
    #  Loop through the list and generate the integer graph for each document

    jdcount = 0
    graph_weight = dcrconfig.ConfigManager().GraphEdgeWeight

    for listitem in list:
        doc = '---' + str(listitem['doc_id']) + '---'
        filepath = directory + '/' + str(doc)
        graph = dcrgraph.create_graph_distant_neighbors(listitem
                                                        ['nounPhrases'],
                                                        graph_weight)
        graph = dcrgraph.generate_document_integer_graph(dict,
                                                         graph,
                                                         doc,
                                                         edge_dict, 'w',
                                                         filepath)
        jdcount += 1
        if jdcount % 10 == 0:
            print('%d' % jdcount)
def generate_nounphrase_insert_into_db(data):
    global count
    try:
        status = "{:<8}".format(str(count)) + " :"
        status += str(datetime.datetime.now())
        count += 1
        mongoport = int(config.ConfigManager().MongoDBPort)
        col = config.ConfigManager().IntelligenceDataCollection
        desc = data['desc']

        noun_phrases = dcrnlp.extract_nounphrases_sentences(desc)

        UpdateTemplateWhere = utility.clean_dict()
        UpdateTemplateSet = utility.clean_dict()
        DBSet = utility.clean_dict()
        UpdateTemplateWhere['_id'] = data['_id']
        UpdateTemplateSet['nounPhrases'] = noun_phrases
        UpdateTemplateSet['description'] = desc
        DBSet['$set'] = UpdateTemplateSet

        status += " |" + str(datetime.datetime.now())
        custom.update_data_to_Db_con(mongoport,
                                     config.ConfigManager().IntelligenceDb,
                                     col, UpdateTemplateWhere, DBSet,
                                     data['connection'])

        status += " |" + str(datetime.datetime.now())
        status += " :" + "{:<9}".format(str(data['doc_id']))
        print(status)

    except BaseException as ex:
        exception_message = '\n' + 'Exception:' + '\n'
        exception_message += str(datetime.datetime.now()) + '\n'
        exception_message += 'File: ' + '\n'
        exception_message += '\n' + str(ex) + '\n'
        exception_message += '-' * 100
        utility.write_to_file(dcrconfig.ConfigManager().SemanticGraphLogFile,
                              'a', exception_message)
def get_normalized_dictionary():
    print('Reading Semantic Graph...')
    graph = nx.read_gexf(dcrconfig.ConfigManager().SemanticGraphFile)

    # create an integer mapping for each of the phrases
    mapping_dict = load_node_dict()
    new_graph = nx.relabel_nodes(graph, mapping_dict)

    edge_int_dict = {}
    edge_count = 0

    # Loop through the edges, ordering each pair so the smaller node id
    # comes first. This will help in compressing the graph
    for edge in new_graph.edges(data=True):
        edge1 = edge[0]
        edge2 = edge[1]
        if edge1 > edge2:
            edge1 = edge[1]
            edge2 = edge[0]
        edge_count += 1
        edge_int_dict[(edge1, edge2)] = edge_count

    return edge_int_dict
def load_document_edges():
    edge_file = open(dcrconfig.ConfigManager().DocumentsEdgesIntegerFile, 'r')
    jdcount = 0

    docs = []
    doc_edges = []
    doc_id = 0
    for line in edge_file:
        line = line.strip()

        if (line.startswith('--')):
            #  A line starting with -- marks the beginning of a new job
            #  description, so flush the previous document's edges
            if (len(doc_edges) > 0):
                doc = {'id': doc_id, 'edges': doc_edges}
                docs.append(doc)
                doc_edges = []
            doc_id = int(line.strip('-'))
            jdcount += 1

        if not (line.startswith('--') or len(line.strip()) < 1):
            doc_edges.append(int(line.split(' ')[0]))

    # Append the last document's edges, which have no trailing -- marker
    if doc_edges:
        docs.append({'id': doc_id, 'edges': doc_edges})
    edge_file.close()
    return docs
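
# A minimal sketch (hypothetical file contents) of the document edge file
# layout parsed above: a '---<doc_id>---' header line followed by one
# '<edge_id> <node1> <node2> <weight>' line per edge.
sample = ['---42---', '7 12 34 5', '9 12 56 3', '---43---', '8 34 56 2']
docs, doc_edges, doc_id = [], [], 0
for line in sample:
    if line.startswith('--'):
        if doc_edges:
            docs.append({'id': doc_id, 'edges': doc_edges})
            doc_edges = []
        doc_id = int(line.strip('-'))
    elif line.strip():
        doc_edges.append(int(line.split(' ')[0]))
if doc_edges:
    docs.append({'id': doc_id, 'edges': doc_edges})
print(docs)  # [{'id': 42, 'edges': [7, 9]}, {'id': 43, 'edges': [8]}]
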
def generate_document_graphs_from_list(dict,
                                       edge_dict,
                                       candidates,
                                       req_cand_file):

    #  Loop through the candidate list and generate the integer graph
    jdcount = 0
    graph_weight = dcrconfig.ConfigManager().GraphEdgeWeight

    for candidate in candidates:
        line = candidate["phrases"]
        doc = '---' + str(candidate["id"]) + '---'

        print("writing %s" % req_cand_file)
        graph = dcrgraph.create_graph_distant_neighbors(line, graph_weight)
        graph = dcrgraph.generate_document_integer_graph(dict,
                                                         graph,
                                                         doc,
                                                         edge_dict,
                                                         'a',
                                                         req_cand_file)
        jdcount += 1
        if jdcount % 10 == 0:
            print('%d' % jdcount)
#!/usr/bin/python3.4
#   File transfer.
#   Runs shell script file to transfer files

import subprocess
import os
import config
import utility
import dcrconfig

if __name__ == "__main__":
    fileTransferDestination = (config.ConfigManager().webServerIp + ':' +
                               config.ConfigManager().mountDirectory)
    semanticGraph = dcrconfig.ConfigManager().SemanticGraphFile.replace(
        config.ConfigManager().mountDirectory + '/', '')
    intEdges = dcrconfig.ConfigManager().IntegerEdegesFile.replace(
        config.ConfigManager().mountDirectory + '/', '')
    nodeDict = dcrconfig.ConfigManager().NodeFile.replace(
        config.ConfigManager().mountDirectory + '/', '')

    # Transferring knowledge files before spark server reboot
    subprocess.call([
        config.ConfigManager().knowledgeFilesTransferScript,
        config.ConfigManager().webServerPassword, semanticGraph, intEdges,
        nodeDict, fileTransferDestination,
        config.ConfigManager().knowledgeFilesBackup
    ])
#!/usr/bin/python3.4
#   Generates integer graphs for documents from a phrase file
#   Reads a graph from a predefined file and optimizes
#   by converting it into integer nodes

import networkx as nx
import dcrgraphcompactor
import dcrconfig
import utility
import datetime

#   main function entry
if __name__ == "__main__":
    utility.write_to_file(
        dcrconfig.ConfigManager().SemanticGraphLogFile, 'a',
        'Semantic graph Generation Step 10..! (dcrdocumentintgraphgenerator.py) '
        + str(datetime.datetime.now()))
    mapping_dict = dcrgraphcompactor.load_node_dict()
    # edge_int_dict = dcrgraphcompactor.get_normalized_dictionary()
    edge_int_dict = dcrgraphcompactor.get_normalized_dictionary_from_int_edges()
    print('Saving Integer Document Graphs...')
    dcrgraphcompactor.generate_document_graphs(mapping_dict, edge_int_dict)
    print("Successfully Completed.!")