Example #1
def process(sentence, doc_2,
            answer):  # Processes user input and outputs the correct response
    invalid_responses = [
        "I do not understand the question",
        "That question is not in my database",
        "I cannot answer that question",
        "I am not familiar with that question",
        "I am sorry. Could you please ask another question?"
    ]
    similarity_index = 0
    index = 0
    nlp = en_core_web_lg.load()
    doc_1 = nlp(preprocess(sentence))
    similarity = 0

    for i in range(len(doc_2)):
        if doc_2[i].vector_norm and doc_1.vector_norm:
            similarity = doc_1.similarity(doc_2[i])

        if similarity > similarity_index:
            similarity_index = similarity
            index = i
    if similarity_index > 0.60:
        return answer[index]

    else:
        return random.choice(invalid_responses)
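A hypothetical usage sketch: it assumes the preprocess helper used above, a random import, and a question corpus vectorized with something like the vectorizer function in Example #4 (the questions and answers here are made up):

questions = ["What are your opening hours?", "Where is the store located?"]
answers = ["We are open from 9am to 5pm.", "The store is at 1 Main Street."]

question_docs = vectorizer(questions)  # spaCy Docs, as in Example #4
print(process("when do you open", question_docs, answers))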
Example #2
def prepare_mag_data(base_dir):
    print("reading file")
    mag_file = os.path.join(base_dir, "mag_subset.txt")
    mag_df = pd.read_csv(mag_file, sep="\t")
    samples = []
    print("file read in")
    # prepare tokenization functions
    nlp = en_core_web_lg.load()
    tokenizer = Tokenizer(nlp.vocab)
    print("vocab loaded")
    # take samples with at least 10 words in citation context
    for index, row in mag_df.iterrows():
        context = row['citationcontext']
        text = re.sub("[" + re.escape(string.punctuation) + "]", " ", context)
        text = [token.lemma_ for token in tokenizer(text) if not token.like_num]
        text = [token for token in text if token.strip()]
        if len(text) < MIN_CONTEXT_LENGTH:
            continue
        # generate sample in correct format
        sample = {"context": context,
                  "authors_citing": row['citingauthors'],
                  "title_cited": row['citedtitle'],
                  "authors_cited": row['citedauthors'],
                  "year": row['year']
                  }
        samples.append(pd.DataFrame(sample, index=[0]))
    print("processing done")
    logger.info("mag samples ready to load to file...")
    
    dataset = pd.concat(samples, axis=0)
    save_path = os.path.join(base_dir, "mag_data.csv")
    
    dataset.to_csv(save_path, sep="\t", compression=None, index=False, index_label=False)
    print("done")
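Examples #2 and #5 refer to several names defined elsewhere in their module; a rough sketch of the setup they appear to assume (the constant's value is a guess based on the "at least 10 words" comment):

import logging
import os
import re
import string

import en_core_web_lg
import pandas as pd
from spacy.tokenizer import Tokenizer

MIN_CONTEXT_LENGTH = 10  # "at least 10 words in citation context"
logger = logging.getLogger(__name__)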
Example #3
def calculate_similarity(src_files, bug_reports):

    # Loading word vectors
    nlp = en_core_web_lg.load()

    src_docs = [
        nlp(' '.join(src.file_name['unstemmed'] +
                     src.class_names['unstemmed'] +
                     src.attributes['unstemmed'] + src.comments['unstemmed'] +
                     src.method_names['unstemmed']))
        for src in src_files.values()
    ]

    min_max_scaler = MinMaxScaler()

    all_simis = []
    for report in bug_reports.values():
        report_doc = nlp(' '.join(report.summary['unstemmed'] +
                                  report.pos_tagged_description['unstemmed']))
        scores = []
        for src_doc in src_docs:
            simi = report_doc.similarity(src_doc)
            scores.append(simi)

        scores = np.array([float(score) for score in scores]).reshape(-1, 1)
        normalized_scores = np.concatenate(
            min_max_scaler.fit_transform(scores))

        all_simis.append(normalized_scores.tolist())

    return all_simis
Example #4
def vectorizer(question):  # Turns questions into a vectorized list
    question_list = []
    nlp = en_core_web_lg.load()
    for index in range(len(question)):
        vectorized_question = nlp(preprocess(question[index]))
        question_list.append(vectorized_question)
    return question_list
Example #5
def clean_mag_data(dataframe, save_path):

    samples = []
    
    # prepare tokenization functions
    # spacy.load("en_core_web_lg") did not work in this environment, so the
    # model package is imported and loaded directly.
    nlp = en_core_web_lg.load()
    tokenizer = Tokenizer(nlp.vocab)

    # take samples with at least 10 words in citation context
    for index, row in dataframe.iterrows():
        context = row['context']
        text = re.sub("[" + re.escape(string.punctuation) + "]", " ", context)
        text = [token.lemma_ for token in tokenizer(text) if not token.like_num]
        text = [token for token in text if token.strip()]
        if len(text) < MIN_CONTEXT_LENGTH:
            continue
        # generate sample in correct format
        sample = {"context": context,
                  "authors_citing": row['authors_citing'],
                  "title_cited": row['title_cited'],
                  "authors_cited": row['authors_cited']}
        samples.append(pd.DataFrame(sample, index=[0]))
    
    logger.info("mag samples ready to load to file...")
        
    dataset = pd.concat(samples, axis=0)
    dataset.to_csv(save_path, sep="\t", compression=None, index=False, index_label=False)
Example #6
def semantic_similarity(word_set):
  nlp = en_core_web_lg.load()
  # Turn every word into a spaCy Doc so vector similarity can be computed
  word_vec = [nlp(word) for word in word_set]
  word_similarity = dict()
  # Compare each pair of words once and keep pairs above the 0.8 threshold
  for i in range(len(word_vec)):
    for j in range(i):
      similarity = word_vec[i].similarity(word_vec[j])
      if similarity >= 0.8:
        word_similarity.setdefault(word_vec[i], []).append(word_vec[j])
        word_similarity.setdefault(word_vec[j], []).append(word_vec[i])
  return word_similarity
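A quick illustration of how semantic_similarity could be called; note that the returned dictionary is keyed by spaCy Doc objects rather than plain strings:

groups = semantic_similarity({"car", "automobile", "banana"})
for doc, similar_docs in groups.items():
    print(doc.text, "->", [d.text for d in similar_docs])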
Example #7
class SpacyEntityExtractor:
    nlp = en_core_web_lg.load()
    accepted_entity_types = ['PERSON', 'GPE', 'ORG', 'PRODUCT', 'EVENT']

    def process_item(self, item, _):
        text = item.get('text')

        doc = self.nlp(text)

        entities_in_doc = {t: [] for t in self.accepted_entity_types}
        for entity in doc.ents:
            label = entity.label_
            if label not in entities_in_doc:
                continue
            entities_in_doc[label].append({
                'spacy_entity': entity.orth_,
                'spacy_position_start': entity.start_char,
                'spacy_position_end': entity.end_char,
            })

        item['spacy_entities'] = entities_in_doc

        return item
Example #8
def main():
    args = parse()
    run = check_if_should_run(args)

    if run:
        nlp = en_core_web_lg.load()

        path = "../lyrics/" + args.dataset + "/"

        if args.dataset != "GBDS":
            data = import_artist_files(path)
            datas = split_train_dev_test(data)
            datas = {
                data_type: form_x_of_songs_and_verses(data)
                for data_type, data in datas.items()
            }
        else:
            datas = create_datasets_for_GBDS(path)

        if "CADS" in args.dataset:
            datas = {'train': datas['train'] + datas['dev'] + datas['test']}

            data = import_duo_artist_file(path)
            data = convert_to_verse_classification_duo_artist(data)
            data = preprocess(data, nlp)
            write_to_csv(data, "test", args.dataset)
        datas = {
            data_type: preprocess(data, nlp)
            for data_type, data in datas.items()
        }
        for data_type, data in datas.items():
            write_to_csv(data, data_type, args.dataset)
        print("Successfully preprocessed all lyrics to .../datasets/" +
              args.dataset + "_'train/dev/test'.csv")
Example #9
def tokenize_ft_extraction(df: pd.DataFrame, col_name: str) -> pd.DataFrame:
    """
    This function takes in a dataframe and a column name for text
    and performs the following on the text:
    remove punctuation;
    change to lower case;
    tokenization, including tokenizing numbers as [NUM];
    lemmatization.

    Args:
        df (pd.DataFrame): dataframe to be transformed
        col_name (str): column name of the text

    Returns:
        pd.DataFrame: transformed dataframe
    """
    punctuation = string.punctuation
    df[col_name] = df[col_name].str.replace('[' + punctuation + ']',
                                            '',
                                            regex=True)
    df[col_name] = df[col_name].str.lower().str.strip()
    nlp = en_core_web_lg.load()
    # merge_entities is the spaCy 2.x pipeline function (from spacy.pipeline);
    # in spaCy 3 the equivalent is nlp.add_pipe("merge_entities").
    nlp.add_pipe(merge_entities)
    df[col_name] = df[col_name].apply(_regex_clean)
    lemmatized_text = []
    df['ents_rep'] = None
    df['vocab'] = None
    df['ppo_rep'] = None
    df['no_ents_text'] = None
    for idx, text in enumerate(df[col_name]):
        doc = nlp(text)
        tokens = []
        ents = []
        texts = []
        ppo = []
        # places = []
        # persons = []
        # orgs = []
        for token in doc:
            if token.lemma_ == '-PRON-':
                tokens.append(token.text)
            elif not token.ent_type_:
                texts.append(token.text)
                tokens.append(token.lemma_)
            else:
                tokens.append(token.ent_type_)
                ents.append(token.text.lower())
                if token.ent_type_ in [*PERSON, *PLACE, *ORG]:
                    ppo.append(token.text.lower())
        lemmatized_text.append(tokens)
        # Use .loc assignment (chained indexing can silently fail on copies) and
        # guard the ratios against empty lists / division by zero.
        df.loc[idx, 'ents_rep'] = len(ents) / max(len(set(ents)), 1)
        df.loc[idx, 'vocab'] = len(set(texts)) / max(len(texts), 1)
        df.loc[idx, 'ppo_rep'] = len(ppo) / max(len(set(ppo)), 1)
        df.loc[idx, 'no_ents_text'] = ' '.join(texts)

    df['lem_text'] = lemmatized_text
    df = _feature_extraction(df, 'lem_text')
    return df
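A hypothetical call, assuming the module defines PERSON, PLACE and ORG as collections of spaCy entity labels and provides the _regex_clean and _feature_extraction helpers (the column name and sample sentence are made up):

import pandas as pd

reviews = pd.DataFrame({"text": ["Alice moved from Paris to Google in 2019."]})
reviews = tokenize_ft_extraction(reviews, "text")
print(reviews[["lem_text", "no_ents_text", "vocab"]])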
Example #10
def getModelWithAbbrQRAndSpeller():
    nlp = model_lg.load()
    return CliNlpModel("lgd_lgm_abbrqr_speller",
                       getIntentSet(),
                       nlp,
                       rewriteDataQuery=rewriteAbbrInQuery,
                       rewriteUserQuery=combineQueryRewriters(
                           [rewriteAbbrInQuery, correctSpellingErrors]))
Example #11
  def load(self):
    if self._nlp is None:
      print("Create nlpWithAzureResourceRecognizer")
      nlp = model_lg.load()
      azureResourceRecognizer = AzureResourceRecognizer(nlp)
      nlp.add_pipe(azureResourceRecognizer, last=True)
      self._nlp = nlp

    return self._nlp
Example #12
def main():
    news = 'Rancho Mirage, a 310-unit multifamily property located in the Las Colinas master-planned community, recently underwent $2 million in property improvements to overhaul units and amenities for 3 Columbus Circle.'
    nlp = en_core_web_lg.load()
    doc = nlp(news)
    for np in list(doc.noun_chunks):
        np.merge(np.root.tag_, np.root.lemma_, np.root.ent_type_)

    for ent in doc.ents:
        print(ent.text, ent.label_, ent.lemma_, ent.root.ent_type_)
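Span.merge() only exists in spaCy 1.x/2.x and was removed in spaCy 3; a sketch of the same noun-chunk merging with the retokenizer API (spaCy 2.1+), as a drop-in replacement for the merge loop inside main():

    with doc.retokenize() as retokenizer:
        for chunk in list(doc.noun_chunks):
            retokenizer.merge(chunk, attrs={"TAG": chunk.root.tag_,
                                            "LEMMA": chunk.root.lemma_,
                                            "ENT_TYPE": chunk.root.ent_type_})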
Example #13
def main():
    # load the GloVe model with 300 dimensions
    nlp = en_core_web_lg.load()
    with open("output/analogy.txt", 'w') as f:
        f.write(print_analogy("king", "man", "queen", nlp) + '\n')
        f.write(print_analogy("London", "England", "Paris", nlp) + '\n')
        f.write(print_analogy("Dog", "Puppy", "Cat", nlp) + '\n')
        f.write(print_analogy("Sister", "Brother", "Aunt", nlp) + '\n')
        f.write(print_analogy("Slow", "Slower", "Fast", nlp) + '\n')
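print_analogy is not shown in this snippet; a rough sketch of how such a helper could be built on top of the loaded vectors (the name, signature and output format are assumptions, and the brute-force vocabulary scan is slow but simple):

import numpy as np

def print_analogy(a, b, c, nlp):
    # Find d such that a : b :: c : d using vector arithmetic over the model's vocabulary.
    target = nlp.vocab[b].vector - nlp.vocab[a].vector + nlp.vocab[c].vector
    best_word, best_sim = None, -1.0
    for key in nlp.vocab.vectors.keys():  # brute-force scan of every stored vector
        word = nlp.vocab.strings[key]
        if word.lower() in (a.lower(), b.lower(), c.lower()):
            continue
        vec = nlp.vocab.vectors[key]
        sim = np.dot(target, vec) / (np.linalg.norm(target) * np.linalg.norm(vec) + 1e-8)
        if sim > best_sim:
            best_word, best_sim = word, sim
    return "{} : {} :: {} : {}".format(a, b, c, best_word)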
Example #14
def getModelWithAbbrQRAndSpeller():
    nlp = model_lg.load()
    return CliNlpModel("lgd_lgm_abbrqr_speller",
                       getAllAsQueries,
                       data.cliData,
                       nlp,
                       rewriteDataQuery=rewriteAbbrInQuery,
                       rewriteUserQuery=lambda q: correctSpellingErrors(
                           rewriteAbbrInQuery(q)))
Example #15
def get_model():
    """ Lazy initializer of model """
    global model

    if model is None:
        print("Loading model...")
        model = en_core_web_lg.load()

    return model
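get_model() relies on a module-level model variable; a minimal sketch of the assumed surrounding setup and usage (the surrounding module is not shown in the snippet):

import en_core_web_lg

model = None             # module-level cache read and written by get_model()

nlp = get_model()        # first call loads en_core_web_lg (slow)
nlp_again = get_model()  # later calls return the cached instance immediately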
Example #16
def main(argv):
    start_time = time.time()

    # Config
    csv_file_name = 'user_queries.csv'
    data_target_name = 'query.pickle'
    x_index = 0
    y_index = 1
    keyword_index = 2
    # query_index = 16 # Let's take a query from the query file
    query_index = int(argv[0])
    keyword_delimiter = ' '
    csv_delimiter = ';'
    csv_quotechar = '"'
    file_allow_overwrite = True

    # Code
    if query_index <= 0:
        print('The query row has to be positive.')
        return  # nothing below can work without a loaded query datapoint
    else:
        print('Loading CSV', csv_file_name)
        data = load_csv(file_name=csv_file_name,
                        x_coordinate_index=x_index,
                        y_coordinate_index=y_index,
                        keywords_index=keyword_index,
                        keywords_delimiter=keyword_delimiter,
                        delimiter=csv_delimiter,
                        quotechar=csv_quotechar,
                        max_read_length=query_index + 1,
                        query_load=True)
        if len(data) > 0:
            print('Query Datapoint:', data[query_index - 1].coordinates.x,
                  data[query_index - 1].coordinates.y,
                  data[query_index - 1].keywords)
            write_pickle(data=data[query_index - 1],
                         file_name=data_target_name,
                         file_allow_overwrite=file_allow_overwrite)

        else:
            print('Could not load any data.')
    nlp = en_core_web_lg.load()
    df_poi_encoded = pd.read_csv(os.path.dirname(os.path.abspath(__file__)) +
                                 '/../files/' + 'poi_keywords_encoded.csv',
                                 index_col='poi_name',
                                 encoding='utf-8')
    # print(df_poi_encoded)
    for kw in data[query_index - 1].keywords:
        df_poi_encoded[kw] = df_poi_encoded.apply(
            lambda row: nlp(row['nlp_keywords_encoded']).similarity(nlp(kw)),
            axis=1)

    df_poi_encoded.to_csv(os.path.dirname(os.path.abspath(__file__)) +
                          '/../files/' + 'poi_queries_similarities.csv',
                          encoding='utf-8')

    print("--- %s seconds ---" % (time.time() - start_time))
Example #17
def get_spacy_nlp():
    """Handles lazy loading of the Spacy NLP model.

    Returns: 
        The loaded Spacy model.
    """
    global _nlp
    if _nlp is None:
        _nlp = en_core_web_lg.load()
    return _nlp
Example #18
def wordEmbedding(
    question
):  # change all questions in the corpus to vectors and store them in a list
    embeddingList = []
    nlp = en_core_web_lg.load()

    for x in range(len(question)):
        doc = nlp(preprocess(question[x]))
        embeddingList.append(doc)
    return embeddingList
Example #19
def getModelWithAzureResourceRecognizer():
    nlp = model_lg.load()
    intentSet = getIntentSet()
    azureResourceRecognizer = AzureResourceRecognizer(nlp)
    nlp.add_pipe(azureResourceRecognizer, last=True)
    return CliNlpModel("lgd_lgm_azRecognizer",
                       intentSet,
                       nlp,
                       rewriteAbbrInQuery,
                       preProcessDoc=updateDocVector)
Example #20
def wordEmbedding(question):  # change all questions in the corpus to vectors and store them in a list
    embeddingList = []
    nlp = en_core_web_lg.load()

    for x in range(len(question)):
        doc = nlp(preprocess(question[x]))
        pre = [doc]
        pre.append(prep.findsenti(question[x]))  # also include the sentiment
        embeddingList.append(pre)
    return embeddingList
Example #21
def getModelWithAbbrQrStopsQrAndSpeller():
    nlp = model_lg.load()
    intentSet = getIntentSet()
    return CliNlpModel("lgd_lgm_abbrQrStopsQrAndSpeller",
                       intentSet,
                       nlp,
                       rewriteDataQuery=combineQueryRewriters(
                           [rewriteAbbrInQuery, rewriteStopWords]),
                       rewriteUserQuery=combineQueryRewriters([
                           rewriteAbbrInQuery, rewriteStopWords,
                           correctSpellingErrors
                       ]))
Example #22
def generate(inputSen, doc2, answer):
    index = 0
    nlp = en_core_web_lg.load()
    doc1 = nlp(preprocess(inputSen))
    inputsenti = prep.findsenti(inputSen)
    similarity = 0
    bestlist = []

    for x in range(len(doc2)):
        if doc2[x][0].vector_norm and doc1.vector_norm:
            similarity = doc1.similarity(
                doc2[x][0]
            )  #compare the input sentence and questions stored in the list

        if similarity > 0.60:
            # This is the threshold: if it is set too high, the input must have a
            # higher degree of similarity to the questions in the corpus to match.
            index = x
            bestlist.append([similarity, index, doc2[x][1]])

    if len(bestlist) == 0:
        # At least 5 different reasonable responses for when the user asks something outside the two topics
        listReply = [
            'Sorry, your question is not included in my database',
            'Sorry, I do not know how to reply to that',
            'Whoops! My brain is dead, maybe next question',
            'Pass on that, bro, I cannot remember',
            'This question is too difficult, next question please',
            'Your question is hard for me, sorry about that'
        ]
        replyOutsideTopic = random.choice(listReply)
        print(replyOutsideTopic)
        return replyOutsideTopic
    sortedanswer = sorted(bestlist, key=operator.itemgetter(0))

    if len(sortedanswer) == 1:
        print(answer[sortedanswer[0][1]])
        return answer[sortedanswer[0][1]]
    else:
        if sortedanswer[-1][0] != sortedanswer[-2][0]:
            print(answer[sortedanswer[-1][1]])
            return answer[sortedanswer[-1][1]]
        else:
            if abs(sortedanswer[-1][2] - inputsenti) > abs(
                    sortedanswer[-2][2] - inputsenti
            ):  # if the top 2 answers have the same similarity, check the sentiment
                print(answer[sortedanswer[-2][1]])
                return answer[sortedanswer[-2][1]]
            else:
                print(answer[sortedanswer[-1][1]])
                return answer[sortedanswer[-1][1]]
Example #23
def train(train_df,
          dev_df,
          lstm_shape,
          lstm_settings,
          model_name,
          batch_size=100,
          nb_epoch=15):
    logger.info("Loading spaCy")

    nlp = en_core_web_lg.load()  # spacy.load('en_vectors_web_lg')
    nlp.add_pipe(nlp.create_pipe('sentencizer'))

    embeddings = get_embeddings(nlp.vocab)
    if model_name == "lstm":
        model = compile_lstm(embeddings, lstm_shape, lstm_settings)
    elif model_name == "lstm_with_attention":
        model = compile_lstm_attention(embeddings, lstm_shape, lstm_settings)
    else:  # model_name == "lstm_with_visualization":
        model = compile_visualizable_lstm_attention(embeddings, lstm_shape,
                                                    lstm_settings)

    tensorboard_dir = os.path.join(logdir, "checkpoints")
    if not os.path.exists(tensorboard_dir):
        os.makedirs(tensorboard_dir)

    tbCallBack = TensorBoard(log_dir=tensorboard_dir,
                             histogram_freq=0,
                             write_graph=True)

    logger.info("Start training...")
    # fit_generator and nb_epoch are the old Keras 1.x-style API; newer Keras
    # versions use model.fit(..., epochs=...) instead.
    model.fit_generator(
        DataGenerator(train_df,
                      nlp=nlp,
                      batch_size=batch_size,
                      max_sentences=lstm_shape["max_sentences"],
                      max_sentence_length=lstm_shape["max_sentence_length"],
                      shuffle=True),
        validation_data=DataGenerator(
            dev_df,
            nlp=nlp,
            batch_size=batch_size,
            max_sentences=lstm_shape["max_sentences"],
            max_sentence_length=lstm_shape["max_sentence_length"],
            shuffle=False),
        steps_per_epoch=int(np.floor(len(train_df) / batch_size)),
        validation_steps=int(np.floor(len(dev_df) / batch_size)),
        nb_epoch=nb_epoch,
        callbacks=[tbCallBack])

    return model, nlp
Example #24
def main():
    # read in human judgement scores
    fname = "data/wordsim-353.txt"
    df = read_human_judgements(fname)

    # load the GloVe model with 300 dimensions
    nlp = en_core_web_lg.load()

    # determine similarity scores using cosine similarity of embeddings
    df['score_embeddings'] = df.apply(
        lambda row: similarity_score(row['word1'], row['word2'], nlp), axis=1)
    # write results to file
    outfile = "output/word_similarity.txt"
    generate_output(df, outfile)
Example #25
def annotate_DEP(dataset_file):
    TRAIN_DATA = []
    model = en_core_web_lg.load()
    with open(dataset_file, 'r') as cve_dataset_f:
        cve_reader = csv.DictReader(cve_dataset_f, delimiter=';')
        for cve in cve_reader:
            # unicode() is Python 2 only; on Python 3 this would simply be str().
            tagged_desc = model(unicode(cve['Avail.']))
            heads = [tok.head.i for tok in tagged_desc]
            deps = ['-'] * len(heads)
            TRAIN_DATA += [[cve['Avail.'], {'heads': heads, 'deps': deps}]]
    with open(
            'annotated_{}_DEP_train.json'.format(
                dataset_file.replace('.csv', '')), 'w') as annotated_f:
        json.dump(TRAIN_DATA, annotated_f)
Example #26
def get_people_orgs_batch(court="Chancery",
                          jx="Delaware",
                          model="large",
                          write=True,
                          overwrite=False):
    """Gets the people and orgs from opinions for an entire court at a time.
        Options allow saving to the database."""

    bigDict = {}

    db, ct = check_court_jx(court, jx)
    if not db: return
    docs = db.objects.filter(Court__exact=ct)
    print("Total records:", len(docs))

    print("Done: ", end="")

    # Load the spaCy model once, outside the per-document loop; reloading it for
    # every document is extremely slow.
    nlp = en_core_web_lg.load()

    for i, doc in enumerate(docs):

        if (i % 10 == 0): print(i, ". . . ", end="")

        # Initial condition: if overwrite is not set, skip documents that
        # already have both People and Organizations populated.
        if write and (not overwrite):
            if doc.People not in (None, "") and doc.Organizations not in (None, ""):
                continue

        text = doc.MainText
        clean_text = clean_MT(text)
        cites = getDocCites(clean_text)
        full_cites = [getWholeCite(cite, clean_text) for cite in cites]
        processed_text = removeCites(clean_text, full_cites)

        people, orgs = people_orgs_batchNLP(processed_text, nlp)
        people = [p for p, _ in people]
        orgs = [o for o, _ in orgs]
        docID = doc.id

        if write:
            doc.People = json.dumps(people)
            doc.Organizations = json.dumps(orgs)
            doc.save()

        newDict = {"people": people, "orgs": orgs}
        bigDict[docID] = newDict

    print("Done")
    return bigDict
Example #27
    def criar_tabela_associacao_frases_recentes_com_tabela_de_frases_agrupada_usando_linguagem_natural_como_criterio(self):
        # Associates recent phrases with the grouped/trending phrase table, using
        # semantic (natural-language) equivalence as the matching criterion.
        print("criar_tabela_associacao_frases_recentes_com_tabela_de_frases_agrupada_usando_linguagem_natural_como_criterio")
        self.nlp = en_core_web_lg.load()
        repositorio = PostagensRepository()
        frases_recentes = repositorio.listar_frases_recentes()
        frases = repositorio.listar_frases_com_tendencia()
        resultado = list()
        for frase in frases:
            frase_equivalente: str = self.equivalencia_semantica(frase, frases_recentes)
            if frase_equivalente is not None:
                nova = dict()
                nova["frase_recente"] = frase_equivalente
                nova["frase"] = frase
                resultado.append(nova)
        repositorio.insere_tabela_associacoes(resultado, "Equivalencia Semantica")
Example #28
def get_most_similar_item(item, old_items):
    """
    given name of new item, and list of names of old items, returns the index of
    the most similar item
    """
    nlp = en_core_web_lg.load()
    item_token = nlp(item.lower().replace('_', ' '))
    similarities = []
    for old_item in old_items:
        similarities.append(
            item_token.similarity(nlp(old_item.lower().replace('_', ' ')))
            )

    similar_index = similarities.index(max(similarities))
    # similar_name = old_items[similar_index]
    return similar_index
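A hypothetical call (the item names are made up); with word vectors, "frozen_yogurt" should come out closest in meaning to "ice_cream", so the expected result is index 0:

old_items = ["frozen_yogurt", "tomato_soup", "garden_rake"]
print(get_most_similar_item("ice_cream", old_items))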
Example #29
    def categorize_jobs(self):
        # Compare word-embedding similarity between each job and a set of
        # predefined categories.
        nlp = en_core_web_lg.load()
        job_id = self.df2.loc[:, 'Job_Id'].tolist()[:self.training_range]
        job_titles = self.df2.loc[:, 'jobtitle'].tolist()[:self.training_range]
        job_descriptions = self.df2.loc[:, 'jobdescription'].tolist()[:self.training_range]
        final_cat = pd.DataFrame(index=job_id)
        categories = ['Network Engineer', 'Full stack', 'QA/Test Developer',
                      'Enterprise application', 'DevOps', 'Mobile Developer',
                      'Back End', 'Database Administrator(DBA)', 'Front End',
                      'Game developer', 'System Administrator', 'Data Scientist',
                      'Business analyst', 'Sales professional', 'Product Manager',
                      'Information Security', 'Software Developer/Java Developer',
                      'Web Developer', 'Cloud Computing']
        for category in categories:
            final_cat[category] = np.nan
        for id_job, job_i, job_d in zip(job_id, job_titles, job_descriptions):
            job_title = nlp(job_i.lower())
            job_description = nlp(job_d.lower())
            match_cat_title = dict()
            match_cat_description = dict()
            for category in categories:
                word = nlp(category.lower())
                match_cat_title[category] = job_title.similarity(word)
                match_cat_description[category] = job_description.similarity(word)
            match_cat_title = sorted(match_cat_title.items(), key=lambda x: x[1], reverse=True)
            match_cat_description = sorted(match_cat_description.items(), key=lambda x: x[1], reverse=True)

            # a is the best-matching category according to the job title
            a = match_cat_title[0]
            match_cat_description = list(
                filter(lambda x: self.check_threshold(match_cat_title, x), match_cat_description))
            if len(match_cat_description) != 0:
                print(match_cat_description)
                print(id_job)
                final_cat.loc[id_job, a[0]] = 1
                match_cat_description.extend([(match_cat_title[0][0], 1)])
                sum_proportion = sum([x[1] for x in match_cat_description])
                for ele in match_cat_description:
                    final_cat.loc[id_job, ele[0]] = ele[1] / sum_proportion
            else:
                print(id_job)
                final_cat.loc[id_job, a[0]] = 1
        return final_cat
Example #30
    def __init__(self):
        """Constructor
        Load spacy model once
        """

        # Set log level
        loglevel = os.environ.get("LOG_LEVEL", "INFO")
        self.logger = logging.getLogger(__name__)
        self.logger.setLevel(loglevel)
        logging.getLogger('tldextract').setLevel(loglevel)

        # Caching top level domains
        tldextract.extract("")

        # Load spaCy lg model
        self.logger.info("Loading NLP model...")
        self.nlp = en_core_web_lg.load(disable=['parser', 'tagger'])