Example #1
    def __init__(self, inc_file=None, art_file=None, in_log=None):
        self.inc_file = inc_file
        self.art_file = art_file
        self.dataframe = None
        self.artifacts = None

        self.sif = ResSIF()
        self.log = in_log if in_log else logging.getLogger(__name__)
        self.inc_ids = []
        super(ResNLP, self).__init__()
Example #2
parser.add_argument("-e", "--inc_sen",
                    help="json file of list of words for incidents",
                    default="inc_sen.json")

args, unknown_args = parser.parse_known_args()

sentence = args.sentence
inc_id = int(args.incident)
sif_file = args.sif
w2v_file = args.w2v
vec_file = args.vec
debug = args.debug
all_ids_file = args.all_ids
inc_sen_file = args.inc_sen
s_util = WordSentenceUtils()
sif = ResSIF()
sif.load_sif(sif_file)

w2v = ResNLP()
w2v.load_model(w2v_file)

vec = ResSen2Vec(w2v.word2vec, sif)
vec.load_s2v(vec_file)

inc_vec = vec.get_incident_vec(str(inc_id))
sen_vec = vec.get_vec_for_sentence(sentence)
u = []
with open("vec_en_new_pcs.json", "r") as infile:
    u = json.load(infile)

u = np.multiply(u, np.transpose(u))
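The u loaded from vec_en_new_pcs.json appears to hold the principal component(s) used for SIF post-processing. For reference, the standard SIF step removes the projection of a sentence vector onto the first principal component; a minimal numpy sketch of that step (the function name is illustrative, not from this library, and it assumes u is a unit-length 1-D component):

import numpy as np

def remove_common_component(sen_vec, u):
    # Standard SIF post-processing: subtract the projection of the sentence
    # vector onto the (unit-length) first principal component u:
    #     v <- v - u * (u . v)
    sen_vec = np.asarray(sen_vec, dtype=float)
    u = np.asarray(u, dtype=float)
    return sen_vec - u * np.dot(u, sen_vec)

Note that np.multiply in the excerpt above is an element-wise product: for a 1-D u it computes u * u, not the outer product u u^T (which would be np.outer(u, u)).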
Example #3
                    "--w2v",
                    help="trained word2vec model",
                    default=FileManage.DEFAULT_NLP_FILE)
parser.add_argument("-v", "--verbose", action="store_true")

args, unknown_args = parser.parse_known_args()

COEFFICIENT_TEMPLATE = u"""\t\t%-30s %s"""

sen1 = args.sentence1
sen2 = args.sentence2
sif_file = args.sif
w2v_file = args.w2v
verbose = args.verbose

sif = ResSIF()
sif.load_sif(sif_file)

w2v = ResNLP()
w2v.load_model(w2v_file)

s2v = ResSen2Vec(w2v=w2v.word2vec, sif=sif)

sim = s2v.get_similarity(sen1, sen2)

s_util = WordSentenceUtils()

words_1 = s_util.get_words(sen1)
words_2 = s_util.get_words(sen2)

print("\nsen_sim:")
Example #4
                    default=FileManage.DEFAULT_SIF_FILE)
parser.add_argument("-w", "--w2v",
                    help="trained word2vec model",
                    default=FileManage.DEFAULT_NLP_FILE)
parser.add_argument("-v", "--vec",
                    help="saved vectors for incidents",
                    default=FileManage.DEFAULT_VEC_FILE)

args, unknown_args = parser.parse_known_args()
sentence = args.sentence
sif_file = args.sif
w2v_file = args.w2v
vec_file = args.vec
num = int(args.num)

sif = ResSIF()
sif.load_sif(sif_file)

w2v = ResNLP()
w2v.load_model(w2v_file)

vec = ResSen2Vec(w2v.word2vec, sif)
vec.load_s2v(vec_file)

closest = vec.get_closest(sentence, num)
print("Find top {} closest incidents: ".format(num))
print("------------------------------")
print("\n")


for inc in closest:
Example #5
#       check_sif.py input_word -s (optional) sif_file

from fn_machine_learning_nlp.lib.file_manage import FileManage
from fn_machine_learning_nlp.lib.nlp.res_sif import ResSIF
from fn_machine_learning_nlp.lib.nlp.res_sen2vec import ResSen2Vec
import argparse

parser = argparse.ArgumentParser(description="Find word count from SIF")
parser.add_argument("word", help="input word")
parser.add_argument("-s",
                    "--sif",
                    help="sif file serialized using python pickle",
                    default=FileManage.DEFAULT_SIF_FILE)

args, unknown_args = parser.parse_known_args()

word = args.word
sif_file = args.sif

print("check-sif:")
print("----------")
print("Check SIF word count for \'{}\' using sif file {}:\n".format(
    word, sif_file))

sif = ResSIF()
sif.load_sif(sif_file)

count = sif.get_word_count(word)
coefficient = ResSen2Vec.SIF_A / (ResSen2Vec.SIF_A + count)
print("\tword count:\t\t\t{}".format(count))
print("\tcoefficient:\t\t\t{}".format(coefficient))
Example #6
def get_incident_href(nlp_str, res_client, num_return, model_path, inc_id):
    """
    For the given nlp_str, find the top num_return (old) incidents that
    are similar to it (from NLP point of view).

    Generate the href links for each of those returned incident as well.

    :param nlp_str:     input sentence to do nlp search
    :param res_client:  resilient client
    :param num_return:  number of closest incidents to return
    :param model_path:  (required) Specify the path to find the saved model
    :param inc_id:      (new) incident id. Don't include this in return.
    :return:
    """
    file_path = model_path
    if not file_path.endswith('/'):
        file_path += '/'

    sif_file = FileManage.DEFAULT_SIF_FILE
    w2v_file = FileManage.DEFAULT_NLP_FILE
    vec_file = FileManage.DEFAULT_VEC_FILE
    pca_file = FileManage.DEFAULT_PCA_FILE

    model_files = os.listdir(model_path)
    # If the model was built under a custom name, the individual files that
    # make up the model can still be identified by their standardized
    # filename suffixes, which depend on the type of file.
    for filename in model_files:
        if "-sif.pkl" in filename:
            sif_file = filename
        elif "-w2v.txt" in filename:
            w2v_file = filename
        elif "-vec.json" in filename:
            vec_file = filename
        elif "-pca.json" in filename:
            pca_file = filename

    # SIF (Smooth Inverse Frequency) file
    sif = ResSIF()
    sif.load_sif(os.path.join(file_path, sif_file))

    # Word2Vec NLP model
    nlp = ResNLP()
    nlp.load_model(os.path.join(file_path, w2v_file))

    # sentence to vector
    vec = ResSen2Vec(nlp.word2vec, sif)
    # load cached vectors for old incidents
    vec.load_s2v(os.path.join(file_path, vec_file))

    # load pca
    vec.load_pca(os.path.join(file_path, pca_file))

    # Find the highest incident id in the vec file. The vec file contains all
    # incidents that existed when the model was built; incidents created after
    # that point have to be fetched and searched separately.
    highest_id = vec.get_highest_inc_id()
    res_utils = ResUtils(resclient=res_client)
    other_incidents = res_utils.get_incidents_after(highest_id)

    incident_ids = vec.get_closest(nlp_str, other_incidents, num_return, inc_id)
    hrefs = [{"inc_link": make_incident_href(inc["ref"], res_client.org_id, res_client.base_url),
              "similarity": inc["sim"],
              "keywords": inc["keywords"]} for inc in incident_ids]
    return hrefs
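
A hedged example of how get_incident_href might be called; res_client is assumed to be an authenticated Resilient client, and the path and ids below are illustrative:

# Illustrative invocation; res_client setup is environment-specific.
hrefs = get_incident_href(
    nlp_str="phishing email with suspicious attachment",
    res_client=res_client,
    num_return=5,
    model_path="/var/models/nlp",
    inc_id=2095,
)
for href in hrefs:
    print(href["inc_link"], href["similarity"], href["keywords"])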
Example #7
args, unknown_args = parser.parse_known_args()
inc_id = int(args.id)
sen_file = args.sentence
ids_file = args.ids
vec_file = args.vec
sif_file = args.sif

with open(sen_file, "r") as infile:
    sentences = json.load(infile)
with open(ids_file, "r") as infile:
    ids = json.load(infile)

vecs = None

sif = ResSIF()
loaded = sif.load_sif(sif_file)

if loaded:
    w_c = []
    sens = [sentences[i] for i in range(len(ids)) if ids[i] == inc_id]
    for w in sens[0]:
        w_c.append((w, sif.get_word_count(w)))

    w_c.sort(key=lambda u: u[1])
    for w in w_c:
        print("%-20s %d" % (w[0], w[1]))
else:
    for i in range(len(ids)):
        if ids[i] == inc_id:
            print(sentences[i])
Example #8
# Standard-library and third-party imports used by this class; the
# library-internal imports (NLPWord2Vec, ResSIF, ResSen2Vec,
# WordSentenceUtils, ResUtils, FileManage) are elided in this excerpt.
import json
import logging
import pandas as pds


class ResNLP(NLPWord2Vec):
    def __init__(self, inc_file=None, art_file=None, in_log=None):
        self.inc_file = inc_file
        self.art_file = art_file
        self.dataframe = None
        self.artifacts = None

        self.sif = ResSIF()
        self.log = in_log if in_log else logging.getLogger(__name__)
        self.inc_ids = []
        super(ResNLP, self).__init__()

    def load_data(self):
        """
        Template method to load data
        :return:
        """
        self.dataframe = pds.read_csv(
            self.inc_file,
            sep=',',
            usecols=["id", "name", "description", "resolution_summary"],
            skipinitialspace=True,
            quotechar='"')
        try:
            # The artifacts are fetched using /search_ex. Make sure the file
            # is there before loading it.
            if self.art_file:
                with open(self.art_file, "r") as art_input:
                    self.artifacts = json.load(art_input)
        except Exception as e:
            self.artifacts = None
            self.log.info("Failed to load artifact file %s: %s",
                          self.art_file, e)

    def preprocess_data(self):
        """
        Template method to preprocess data
        :return:
        """
        self.dataset = []
        self.inc_ids = []
        word_utils = WordSentenceUtils()
        row_count = self.dataframe.shape[0]

        for index in range(row_count):
            row = self.dataframe.iloc[index]
            #
            #   Retrieve the name, description, and resolution_summary from an incident
            #
            sentence = (str(row["name"]) + " " + str(row["description"]) +
                        " " + str(row["resolution_summary"]))
            #
            #   Retrieve the artifact value and description from an incident
            #
            inc_id = int(row["id"])
            if self.artifacts is not None:
                artifact_des = ResUtils.get_artifact_des(
                    inc_id, self.artifacts)
                sentence += artifact_des
            ws = word_utils.get_words(sentence)

            self.inc_ids.append(inc_id)
            self.dataset.append(ws)

    def build(self):
        """
        Build word2vec and SIF
        :return:
        """
        #
        #   Build gensim word2vec model
        #
        self.build_model()
        #
        #   Build SIF
        #
        self.sif.build_sif(self.dataset)

    def save(self, w2v_file=None, sif_file=None, s2v_file=None):
        """
        Save word2vec, SIF, and the cached sentence vectors
        :return:
        """
        #
        #   Save gensim.word2vec
        #
        w2vfile = w2v_file if w2v_file else FileManage.DEFAULT_NLP_FILE
        self.save_model(w2vfile)
        #
        #   Save SIF data
        #
        siffile = sif_file if sif_file else FileManage.DEFAULT_SIF_FILE
        self.sif.save_sif(siffile)
        #
        #   Save vec cache
        #
        s2vfile = s2v_file if s2v_file else FileManage.DEFAULT_VEC_FILE
        sen2vec = ResSen2Vec(w2v=self.word2vec, sif=self.sif, log=self.log)
        sen2vec.cache_sentence_vectors(self.dataset, self.inc_ids, s2vfile)
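
Putting the template methods together, a typical build run with this class could look like the following sketch (the file names are illustrative):

import logging

nlp = ResNLP(inc_file="incidents.csv",
             art_file="artifacts.json",
             in_log=logging.getLogger("nlp-build"))
nlp.load_data()        # read the incident CSV and optional artifacts JSON
nlp.preprocess_data()  # tokenize name/description/resolution into the dataset
nlp.build()            # train word2vec and build the SIF word counts
nlp.save()             # write w2v, SIF, and cached sentence vectors to defaults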