Example No. 1
 def init_processor(self, yanwenzi_dict):
     actree = KeywordProcessor()
     for word, r_word in yanwenzi_dict.items():
         actree.add_keyword(word, '_{}_'.format(r_word))  # add the word (and its '_label_' replacement) to the trie
     print('success loaded. keyword num : ', len(actree))
     return actree
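# A minimal usage sketch for the processor built above; the emoticon dictionary
# below is made up, and the method is exercised as a plain function here
# (None is passed for self).
yanwenzi_dict = {'T_T': 'cry', 'QAQ': 'cry', 'hhh': 'laugh'}
actree = init_processor(None, yanwenzi_dict)
print(actree.replace_keywords('今天加班到十点 T_T'))
# the matched emoticon should come back as '_cry_'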
Example No. 2
def getYelp3Words(yelpScraperResult):
    df = yelpScraperResult

    def customtokensize(text):
        return re.findall(r"[\w']+", str(text))

    df['tokenized_text'] = df[1].apply(customtokensize)
    stopwords = [
        'and', 'was', 'were', 'had', 'check-in', '=', '= =', 'u', 'want',
        'u want', 'cuz', 'him', "i've", 'on', 'her', 'told', 'ins', '1 check',
        'I', 'i"m', 'i', ' ', 'it', "it's", 'it.', 'they', 'the', 'this',
        'its', 'l', 'they', 'this', "don't", 'the ', ' the', 'it', 'i"ve',
        'i"m', '!', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '/', '.',
        ','
    ]

    def filter_stopwords(text):
        nonstopwords = []
        for i in text:
            if i not in stopwords:
                nonstopwords.append(i)
        return nonstopwords

    df['tokenized_text'] = df['tokenized_text'].apply(filter_stopwords)
    df['parts_of_speech_reference'] = df['tokenized_text'].apply(
        filter_stopwords)
    df['parts_of_speech_reference'] = df['parts_of_speech_reference'].str.join(
        ' ')

    def find_noun_noun(x):
        noun_list = []
        doc = nlp(str(x))
        try:
            for token in range(len(doc)):
                sub_list = []
                if doc[token].pos_ == 'NOUN' and doc[token + 1].pos_ == 'NOUN':
                    sub_list.append(doc[token - 1])
                    sub_list.append(doc[token])
                    sub_list.append(doc[token + 1])
                if len(sub_list) != 0 and sub_list not in noun_list:
                    noun_list.append(sub_list)
        except IndexError as e:
            pass
        return noun_list

    def find_adj_noun(x):
        adj_noun_list = []
        doc = nlp(str(x))
        try:
            for token in range(len(doc)):
                sub_list = []
                if doc[token].pos_ == 'ADJ' and doc[token + 1].pos_ == 'NOUN':
                    sub_list.append(doc[token - 1])
                    sub_list.append(doc[token])
                    sub_list.append(doc[token + 1])
                if len(sub_list) != 0 and sub_list not in adj_noun_list:
                    adj_noun_list.append(sub_list)
        except IndexError as e:
            pass
        return adj_noun_list

    def find_the(x):
        the_list = []
        doc = nlp(str(x))
        try:
            for token in range(len(doc)):
                sub_list = []
                if doc[token].text == 'the' or doc[token].text == 'a' or doc[
                        token].text == 'an':
                    sub_list.append(doc[token + 1])
                    sub_list.append(doc[token + 2])
                    sub_list.append(doc[token + 3])
                    # sub_list.append(doc[token+4])
                if len(sub_list) != 0 and sub_list not in the_list:
                    the_list.append(sub_list)
        except IndexError as e:
            pass
        return the_list

    df['word_segments_nn'] = df['parts_of_speech_reference'].apply(
        find_noun_noun)
    df['word_segments_adjn'] = df['parts_of_speech_reference'].apply(
        find_adj_noun)
    df['word_segments_the'] = df['parts_of_speech_reference'].apply(find_the)

    noun_noun_phrases = []
    string = ''
    for i in df['word_segments_nn']:
        for x in i:
            string = ' '.join([str(elem) for elem in x])
            noun_noun_phrases.append(string)
    adj_noun_phrases = []
    for i in df['word_segments_adjn']:
        for x in i:
            string = ' '.join([str(elem) for elem in x])
            adj_noun_phrases.append(string)
    the_phrases = []
    for i in df['word_segments_the']:
        for x in i:
            string = ' '.join([str(elem) for elem in x])
            the_phrases.append(string)

    all_phrases = noun_noun_phrases + adj_noun_phrases + the_phrases

    class FlashTextExtact(st.FeatsFromSpacyDoc):
        '''
        Feature extractor that counts FlashText keyword matches in each document.
        '''
        def set_keyword_processor(self, keyword_processor):
            '''
            :param keyword_processor: set, phrases to look for
            :return: self
            '''
            self.keyword_processor_ = keyword_processor
            return self

        def get_feats(self, doc):
            '''
            Parameters
            ----------
            doc, Spacy Doc
            Returns
            -------
            Counter noun chunk -> count
            '''
            return Counter(self.keyword_processor_.extract_keywords(str(doc)))

    keyword_processor = KeywordProcessor(case_sensitive=False)

    for phrase in all_phrases:
        keyword_processor.add_keyword(phrase)
    feature_extractor = FlashTextExtact().set_keyword_processor(
        keyword_processor)

    df['parse'] = df['parts_of_speech_reference'].apply(
        st.whitespace_nlp_with_sentences)
    corpus = (st.CorpusFromPandas(
        df,
        category_col=2,
        text_col='parts_of_speech_reference',
        nlp=st.whitespace_nlp_with_sentences,
        feats_from_spacy_doc=feature_extractor).build())

    term_freq_df = corpus.get_term_freq_df()
    term_freq_df['highratingscore'] = corpus.get_scaled_f_scores(
        '5.0 star rating')

    term_freq_df['poorratingscore'] = corpus.get_scaled_f_scores(
        '1.0 star rating')
    dh = term_freq_df.sort_values(by='highratingscore', ascending=False)
    dh = dh[['highratingscore', 'poorratingscore']]
    dh = dh.reset_index(drop=False)
    dh = dh.rename(columns={'highratingscore': 'score'})
    dh = dh.drop(columns='poorratingscore')
    positive_df = dh.head(10)
    negative_df = dh.tail(10)
    results = {
        'positive': [{
            'term': pos_term,
            'score': pos_score
        } for pos_term, pos_score in zip(positive_df['term'],
                                         positive_df['score'])],
        'negative': [{
            'term': neg_term,
            'score': neg_score
        } for neg_term, neg_score in zip(negative_df['term'],
                                         negative_df['score'])]
    }
    return results
Example No. 3
jinan = ['商河县', '济阳县', '平阴县']
heze = ['定陶县', '单县', '成武县', '东明县', '曹县', '郓城县', '鄄城县', '巨野县']
fubo = ['桓台', '高青', '沂源']
dying = ['利津县', '垦利县', '广饶县', '东营县']
yantai = ['长岛县']
weifang = ['青州县', '诸城县', '寿光县', '安丘县', '高密县', '昌邑县']
jinin = ['微山县', '鱼台县', '金乡县', '嘉祥县', '汶上县', '泗水县', '梁山县']
taian = ['东平县', '宁阳县']
rizhao = ['五莲县', '莒县']
binzhou = ['邹平县', '博兴县', '惠民县', '沾化县', '无棣县', '阳信县']
dezhou = ['齐河县', '平原县', '夏津县', '武城县', '陵县', '临邑县', '宁津县', '庆云县']
liaocheng = ['茌平县', '高唐县', '阳谷县', '东阿县', '莘县', '冠县']
lingxi = ['沂水县', '沂南县', '平邑县', '蒙阴县', '费县', '郯城县', '苍山县', '莒南县', '临沭县']

for each2 in dict1:
    keyword.add_keyword(each2)


def Get_html(Name):
    async def fetch(session, url):
        async with session.get(url) as response:
            return await response.text()

    async def look():
        async with aiohttp.ClientSession() as session:
            html = await fetch(
                session, 'https://www.bing.com/search?q=intitle:%s' % Name)
            Get_N_url(Name, html)

    if __name__ == '__main__':
        loop = asyncio.get_event_loop()
        loop.run_until_complete(look())
    MISSPELL_PATH = open('./preprocessing/mispell.txt', 'r')
    mispell_dict = dict()
    for misspell_word in MISSPELL_PATH:
        words = misspell_word.split(',')
        mispell_dict[words[0]] = words[1].strip(' |\n')

    mix_mispell_dict = {}
    for k, v in mispell_dict.items():
        mix_mispell_dict[k] = v
        mix_mispell_dict[k.lower()] = v.lower()
        mix_mispell_dict[k.upper()] = v.upper()
        mix_mispell_dict[k.capitalize()] = v.capitalize()

    kp = KeywordProcessor(case_sensitive=True)
    for k, v in mix_mispell_dict.items():
        kp.add_keyword(k, v)

    sentences = [
        'motherfuckeraif this is a test ! ', 'i am a robot',
        'i want to silver', 'this is a test, trumpdon !, trumpland, sallary'
    ]
    sentences = pd.DataFrame(sentences, columns=['comment_text'])
    print('this is a test case sentence !')
    print('clean text')
    sentences = sentences['comment_text'].apply(
        lambda sentence: content_preprocessing(sentence, kp))

    print(sentences)
    print('---' * 30)

    print('lstm')
Example No. 5
def drop_ngrams_with_keyword(texts: Collection[str], term: str) -> Collection:
    keyword_processor = KeywordProcessor()
    keyword_processor.add_keyword(term)

    # keep only the texts in which the keyword does not occur
    return [text for text in texts if not keyword_processor.extract_keywords(text)]
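# Usage sketch with made-up inputs; only the texts that do not contain the
# keyword should be kept.
print(drop_ngrams_with_keyword(
    ['machine learning rocks', 'deep learning', 'rocks'], 'deep learning'))
# expected: ['machine learning rocks', 'rocks']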
Example No. 6
nlp = spacy.load("en_core_web_sm")
#for i in range(200,250):
#    print(i)
#    s = job_data[i].replace("\\r\\n"," ").replace("\\t","")
#    qual = get_qualification(s)
#    imp_entities=get_Relations([qual],nlp,"QUANTITY") #PERSON #FAC #
#    print(imp_entities)
#print("{:<10}\t{}\t{}".format(r1.text, r2.ent_type_, r2.text))

# In[53]:

keyword_processorTitle = KeywordProcessor()
s = job_data[548].replace("\\r\\n", " ").replace("\\t", "")
for title in title_text:
    title = title.strip()
    keyword_processorTitle.add_keyword(title)

# In[58]:

for i in range(500, 510):
    s = job_data[i].replace("\\r\\n", " ").replace("\\t", "")
    keywords_found = keyword_processorTitle.extract_keywords(s)
    print(i)
    print(get_position(s))
    if (len(keywords_found) > 0):
        for j in range(len(keywords_found)):
            if (j > 0):
                if ((keywords_found[j] != get_position(s)) &
                    (keywords_found[j] != keywords_found[j - 1])):
                    print("Previous position:")
                    print(keywords_found[j])
Example No. 7
def main(name):
    keyword_processor = KeywordProcessor()

    def convert_list_to_string(org_list, seperator=' '):

        return seperator.join(org_list)

    metinler = [
        'ne', 'zaman'
    ]  # for now this can only answer questions asked with "ne zaman" ("when")
    # source: https://python.yemreak.com/temel/string-islemleri

    g = requests.get("https://www.google.com/")
    gugıl = BeautifulSoup(g.content)
    print(gugıl.find_all("gLFyf gsfi"))

    soru = name  # the entered string is what gets searched on Google
    # answer to the "ne zaman" (when) question
    #-------------------------------------------------------------------------------------------------------------------------------------------------------
    if (
            all(metin in soru for metin in metinler)
    ):  # swap "ne zaman" in the question for phrases that could appear in the answer

        tarihleriarasında = "tarihleri arasında"
        result1 = re.sub(r"ne zaman", tarihleriarasında,
                         soru)  # the string replace method would also work here
        # source: https://stackabuse.com/using-regex-for-text-manipulation-in-python/

    if (all(metin in soru for metin in metinler)):

        tarihinde = "tarihinde"
        result2 = re.sub(r"ne zaman", tarihinde, soru)
        # TODO: tidy this up

    if (all(metin in soru for metin in metinler)):

        saatlerinde = "saatlerinde"
        result2 = re.sub(r"ne zaman", saatlerinde, soru)
        # TODO: tidy this up
    if (all(metin in soru for metin in metinler)):

        saatleriarasında = "saatleri arasında"
        result2 = re.sub(r"ne zaman", saatleriarasında, soru)
    if (all(metin in soru for metin in metinler)):

        kadardevamedecek = "kadar devam edecek"
        result2 = re.sub(r"ne zaman", kadardevamedecek, soru)
        # TODO: tidy this up

    # the phrase we may find
    #print(result1)
    #print(result2)

    #-------------------------------------------------------------------------------------------------------------------------------------------------------
    try:
        liste = result1.split(tarihleriarasında)  # TODO: tidy this up
        stringifade1 = convert_list_to_string(
            liste)  # here the split list is converted back into a string
    except:
        print("bu ifade bulunamıyor")
    # stringifade1 is not used yet, but it will be later
    # source: https://thispointer.com/python-how-to-convert-a-list-to-string/#:~:text=Convert%20list%20to%20string%20in%20python%20using%20join()%20in,a%20function%20join()%20i.e.&text=join()%20function%20accepts%20an,it%20returns%20the%20concatenated%20string.

    # later the URL part will be fetched automatically (could be handled with RPA)
    for j in search(soru, tld="co.in", num=10, stop=1, pause=2):
        url = j  # take the first URL returned for the question asked at the start
        # source: https://www.geeksforgeeks.org/performing-google-search-using-python-code/

    r = requests.get(url)

    #r= requests.get("https://www.hurriyet.com.tr/galeri-hafta-sonu-sokaga-cikma-yasagi-ne-zaman-saat-kacta-bitiyor-23-25-nisan-sokaga-cikma-yasagi-saatleri-41795118/3")
    r.content
    # source: https://docs.python-requests.org/en/master/
    soup = BeautifulSoup(r.content)
    soup.prettify(
    )  # tidy up the data fetched from the website (not strictly necessary)
    textamastringdegil = soup.find_all()  # scanned the whole page
    # source: https://www.crummy.com/software/BeautifulSoup/bs4/doc/
    text = str(textamastringdegil)  # and stored it in the text variable

    #-------------------------------------------------------------------------------------------------------------------------------------------------------
    if re.search(
            tarihleriarasında, text
    ):  # source: https://stackabuse.com/using-regex-for-text-manipulation-in-python/
        print("Match found")
    else:
        print("Match not found")

    if re.search(tarihinde, text):
        print("Match found")
    else:
        print("Match not found")

    if re.search(saatlerinde, text):
        print("Match found")
    else:
        print("Match not found")

    if re.search(saatleriarasında, text):
        print("Match found")
    else:
        print("Match not found")
    if re.search(kadardevamedecek, text):
        print("Match found")
    else:
        print("Match not found")

    #-------------------------------------------------------------------------------------------------------------------------------------------------------
    nezamanakarsilik = [
        tarihleriarasında, tarihinde, saatlerinde, saatleriarasında,
        kadardevamedecek
    ]
    for x in nezamanakarsilik:
        keyword_processor.add_keyword(
            x
        )  # looks for the phrase inside the text and returns a span when it is found
        keywords_found = keyword_processor.extract_keywords(text,
                                                            span_info=True)
        # source: https://flashtext.readthedocs.io/en/latest/

        try:
            aralık = keywords_found[0]  # wrapped in try/except

            print(
                text[(aralık[1] - 34):aralık[2]]
            )  # this part prints the span that could answer the "when" question
            print(str(aralık[1]))
            print(str(aralık[2]))
            son = text[(aralık[1] - 34):aralık[2]]
            # finally, this gives the lines that could answer the question
        except IndexError:
            print(x + "-> bu kelime internet sitesinde yer almıyor")
            son = "-> bu kelime internet sitesinde yer almıyor"

    return son
Example No. 8
def find_educational_qualifications(text):
    keywords_deg = KeywordProcessor()

    diploma_list = ["diploma"]
    bachelor_list = ["bachelor", "bachelor's", "bachelors"]
    master_list = ["master", "master's", "masters"]
    phd_list = ["phd", "ph d", "p.h.d", "ph.d", "ph. d", "doctorate"]

    for item in diploma_list:
        keywords_deg.add_keyword(item, ("Diploma", "NA"))

    for item in bachelor_list:
        keywords_deg.add_keyword(item, ("Bachelor", "NA"))

    for item in master_list:
        keywords_deg.add_keyword(item, ("Master", "NA"))

    for item in phd_list:
        keywords_deg.add_keyword(item, ("PhD", "NA"))

    identified_edu = keywords_deg.extract_keywords(text, span_info=True)

    edu_list = []
    for item in identified_edu:
        start_pos = item[2] + 1
        end_pos = start_pos + 100
        t_len = len(text)
        if end_pos > t_len:
            end_pos = t_len
        tmp_str = text[start_pos:end_pos].lower()
        discipline_list = find_discipline(tmp_str)
        edu_list.append((item[0] + (" ".join(discipline_list),)))

    b_reg = re.compile(r"\bB[.]?\s?[A-Z][A-Z]?[a-z]*\b")
    b_list = [(i.group(), i.span()) for i in b_reg.finditer(text)]

    b_list_full = []
    for item in b_list:
        start_pos = item[1][1] + 1
        end_pos = start_pos + 100
        t_len = len(text)
        if end_pos > t_len:
            end_pos = t_len
        tmp_str = text[start_pos:end_pos].lower()
        discipline_list = find_discipline(tmp_str)

        b_list_full.append(("Bachelor", item[0].replace('.', '').replace(' ', ''), " ".join(discipline_list)))

    m_reg = re.compile(r"\bM[.]?\s?[A-Z][A-Z]?[a-z]*\b")
    m_list = [(i.group(), i.span()) for i in m_reg.finditer(text)]

    m_list_full = []
    for item in m_list:
        start_pos = item[1][1] + 1
        end_pos = start_pos + 100
        t_len = len(text)
        if end_pos > t_len:
            end_pos = t_len
        tmp_str = text[start_pos:end_pos].lower()
        discipline_list = find_discipline(tmp_str)

        m_list_full.append(("Master", item[0].replace('.', '').replace(' ', ''), " ".join(discipline_list)))

    # Prefer the regex-based degree matches: drop keyword-based "Bachelor"/"Master"
    # entries whenever a more specific B.x / M.x match was found.
    edu_filtered = []
    for item in edu_list:
        if len(b_list_full) > 0 and item[0] == "Bachelor":
            continue
        if len(m_list_full) > 0 and item[0] == "Master":
            continue
        edu_filtered.append(item)

    edu_filtered = sorted(list(set(edu_filtered + b_list_full + m_list_full)))

    return_list = []
    for item in edu_filtered:
        if item[1] != "NA" and item[2] != "NA":
            return_list.append(item[0] + ", " + item[1] + ", " + item[2])
        elif item[1] != "NA":
            return_list.append(item[0] + ", " + item[1])
        elif item[2] != "NA":
            return_list.append(item[0] + ", " + item[2])
        else:
            return_list.append(item[0])

    return return_list
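# A small usage sketch. find_discipline comes from elsewhere in this project,
# so a trivial stand-in is defined here (purely illustrative) to keep the
# snippet self-contained; re and flashtext are assumed imported as above.
def find_discipline(text):
    return [w for w in ("computer science", "engineering") if w in text]

print(find_educational_qualifications(
    "Requirements: Bachelors or B.Sc in computer science; a Masters degree is a plus."))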
Example No. 9
from flashtext import KeywordProcessor
import os
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

keyword_processor = KeywordProcessor()

with open(BASE_DIR + "/utils/keywords") as keywords:
    for keyword in keywords.readlines():
        keyword_processor.add_keyword(keyword.strip())


def tagEx(data):
    try:
        keywords_found = keyword_processor.extract_keywords(data)
    except Exception as e:
        return "ERR"

    return list(set(keywords_found))
Example No. 10
def init_flashtext(words, size):
    keyword_processor = KeywordProcessor()
    for i in range(size):
        keyword_processor.add_keyword(words[i])
    return keyword_processor
Example No. 11
with open("attack_classifier/assets/tokenizer.pickle", "rb") as handle:
    tokenizer = pickle.load(handle)


def standardize(word):
    return unidecode.unidecode(word.lower())


with open("attack_classifier/assets/contractions.txt") as json_file:
    contraction_mapper = json.load(json_file)

contraction_processor = KeywordProcessor()

for k, v in contraction_mapper.items():
    contraction_processor.add_keyword(k, v)


def replace_contraction(text):
    return contraction_processor.replace_keywords(text)


def remove_non_alpha(text):
    regex = re.compile(r"[^a-zA-Z\s]")
    return regex.sub("", text)


def tokenize(text):
    tokens = text.split(" ")
    tokens = [t for t in tokens if t != ""]
    return tokens
 def keysense(self):
     keyword_processor = KeywordProcessor(case_sensitive=True)
     keyword_processor.add_keyword(self.add_keyword)
     keywords_found = keyword_processor.extract_keywords(self.text)
     return keywords_found
Example No. 13
def init_results_v8(data_list,
                    gt_data_list,
                    terms_based_resutls,
                    g_score_dict,
                    match_filtering_k=3,
                    term_retrieval_top_k=5,
                    multihop_retrieval_top_k=None):
    # 2019-04-06
    # The complete v7 version of retrieval

    ner_set = get_title_entity_set()

    # dev_fullwiki_list = common.load_json(config.DEV_FULLWIKI_FILE)
    print("Total data length:")
    print(len(data_list))

    # We load term-based results
    print("Load term-based results.")
    terms_based_results_dict = dict()
    for item in terms_based_resutls:
        terms_based_results_dict[item['qid']] = item

    # Load tf-idf_score function:
    # g_score_dict = dict()
    # load_from_file(g_score_dict,
    #                config.PDATA_ROOT / "reverse_indexing/abs_rindexdb/scored_db/default-tf-idf.score.txt")

    keyword_processor = KeywordProcessor(case_sensitive=True)
    keyword_processor_disamb = KeywordProcessor(case_sensitive=True)

    print("Build Processor")
    for kw in tqdm(ner_set):
        if filter_word(kw) or filter_document_id(kw):
            continue  # skip if the keyword is filtered by the function above or is a stopword
        else:
            # matched_key_word is the original matched span. we need to save it for group ordering.
            matched_obj = _MatchedObject(matched_key_word=kw,
                                         matched_keywords_info={kw: 'kwm'})
            keyword_processor.add_keyword(kw, matched_obj)
    #
    for kw in wiki_util.title_entities_set.disambiguation_group:
        if filter_word(kw) or filter_document_id(kw):
            continue  # skip if the keyword is filtered by the function above or is a stopword
        else:
            if kw in keyword_processor:
                # if the kw existed in the kw_processor, we update its dict to add more disamb items
                existing_matched_obj: _MatchedObject = keyword_processor.get_keyword(
                    kw)
                for disamb_kw in wiki_util.title_entities_set.disambiguation_group[
                        kw]:
                    if filter_document_id(disamb_kw):
                        continue
                    if disamb_kw not in existing_matched_obj.matched_keywords_info:
                        existing_matched_obj.matched_keywords_info[
                            disamb_kw] = 'kwm_disamb'
            else:  # If not we add it to the keyword_processor_disamb, which is set to be lower priority
                # new_dict = dict()
                matched_obj = _MatchedObject(matched_key_word=kw,
                                             matched_keywords_info=dict())
                for disamb_kw in wiki_util.title_entities_set.disambiguation_group[
                        kw]:
                    if filter_document_id(disamb_kw):
                        continue
                    matched_obj.matched_keywords_info[disamb_kw] = 'kwm_disamb'
                    # new_dict[disamb_kw] = 'kwm_disamb'
                keyword_processor_disamb.add_keyword(kw, matched_obj)

    doc_pred_dict = {'sp_doc': dict(), 'raw_retrieval_set': dict()}
    # doc_pred_dict_p1 = {'sp_doc': dict(), 'raw_retrieval_set': dict()}

    for item in tqdm(data_list):
        question = item['question']
        qid = item['_id']

        query_terms = get_query_ngrams(question)
        valid_query_terms = [
            term for term in query_terms if term in g_score_dict
        ]

        retrieved_set = RetrievedSet()

        # This method will add the keyword match results in-place to retrieved_set.
        get_kw_matching_results(question, valid_query_terms, retrieved_set,
                                match_filtering_k, g_score_dict,
                                keyword_processor, keyword_processor_disamb)

        # Then we add term-based matching results
        added_count = 0
        for score, title in sorted(terms_based_results_dict[qid]['doc_list'],
                                   key=lambda x: x[0],
                                   reverse=True)[:term_retrieval_top_k + 3]:
            if not filter_word(title) and not filter_document_id(title):
                retrieved_set.add_item(RetrievedItem(title, 'tf-idf'))
                added_count += 1
                if term_retrieval_top_k is not None and added_count >= term_retrieval_top_k:
                    break

        # Add hyperlinked pages:
        finded_keys_set = set(
            retrieved_set.to_id_list()
        )  # for finding hyperlinked pages we do this for both the keyword-matching and disambiguation groups.
        # 3. We then add some hyperlinked titles
        db_cursor = wiki_db_tool.get_cursor(config.WHOLE_WIKI_DB)

        for keyword_group in finded_keys_set:
            flatten_hyperlinks = []
            hyperlinks = wiki_db_tool.get_first_paragraph_hyperlinks(
                db_cursor, keyword_group)
            for hls in hyperlinks:
                flatten_hyperlinks.extend(hls)

            for hl in flatten_hyperlinks:
                potential_title = hl.href
                if potential_title in ner_set and not filter_word(
                        potential_title) and not filter_document_id(
                            potential_title
                        ):  # important bug fixing 'or' to 'and'
                    # hyperlinked_title.append(potential_title)

                    # if not filter_document_id(potential_title):
                    score = get_query_doc_score(valid_query_terms,
                                                potential_title, g_score_dict)
                    retrieved_set.add_item(
                        retrieval_utils.RetrievedItem(potential_title,
                                                      'kwm_disamb_hlinked'))
                    retrieved_set.score_item(potential_title,
                                             score,
                                             namespace=keyword_group +
                                             '-2-hop')

        for keyword_group in finded_keys_set:
            retrieved_set.sort_and_filter(keyword_group + '-2-hop',
                                          top_k=multihop_retrieval_top_k)

        doc_pred_dict['sp_doc'][qid] = retrieved_set.to_id_list()
        doc_pred_dict['raw_retrieval_set'][qid] = retrieved_set

    if gt_data_list is not None:
        ext_hotpot_eval.eval(doc_pred_dict, gt_data_list)
    return doc_pred_dict
Example No. 14
import random
from multiprocessing import Manager
from queue import Queue

from flashtext import KeywordProcessor
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor

kp = KeywordProcessor()
with open('keywords.txt', 'r') as f:
    for word in f:
        kp.add_keyword(word.strip().lower())

thread_count = 10
procManager = Manager()
request_queue = procManager.Queue()
# request_queue=Queue()


def task(n):
    print(n)
    try:
        rand = random.Random()
        while True:
            with open('{}.txt'.format(rand.randint(1000, 2000)), 'w') as f:

                docs = request_queue.get()
                if docs is not None:
                    keywords_found = kp.extract_keywords(docs)
                    print(keywords_found)
                    f.write("\n".join(keywords_found))
                    f.flush()
                else:
                    break  # treat a None item as the shutdown signal
    except Exception as e:
        print(e)
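# A minimal driver sketch for the worker above: feed a couple of made-up
# documents through the shared queue, then push one None per worker as a
# shutdown signal (see the break added in task above).
if __name__ == '__main__':
    sample_docs = ['some text to scan for keywords', 'another short document']
    with ThreadPoolExecutor(max_workers=thread_count) as executor:
        futures = [executor.submit(task, n) for n in range(thread_count)]
        for doc in sample_docs:
            request_queue.put(doc)
        for _ in range(thread_count):
            request_queue.put(None)
        for fut in futures:
            fut.result()  # wait for the workers to finish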
Example No. 15
def add(extract):
    keyword_processor = KeywordProcessor()
    r = keyword_processor.add_keyword(extract)  # returns True when the keyword is successfully added
    return r
Example No. 16
    def main(name):
        keyword_processor = KeywordProcessor()

        def convert_list_to_string(org_list, seperator=' '):

            return seperator.join(org_list)

        metinler = [
            'ne', 'zaman'
        ]  # for now this can only answer questions asked with "ne zaman" ("when")
        # source: https://python.yemreak.com/temel/string-islemleri

        g = requests.get("https://www.google.com/")
        gugıl = BeautifulSoup(g.content)
        print(gugıl.find_all("gLFyf gsfi"))

        soru = name  # the entered string is what gets searched on Google
        # answer to the "ne zaman" (when) question
        #-------------------------------------------------------------------------------------------------------------------------------------------------------
        if (
                all(metin in soru for metin in metinler)
        ):  # swap "ne zaman" in the question for phrases that could appear in the answer

            tarihleriarasında = "tarihleri arasında"
            result1 = re.sub(r"ne zaman", tarihleriarasında,
                             soru)  # the string replace method would also work here
            # source: https://stackabuse.com/using-regex-for-text-manipulation-in-python/

        if (all(metin in soru for metin in metinler)):

            tarihinde = "tarihinde"
            result2 = re.sub(r"ne zaman", tarihinde, soru)
            # TODO: tidy this up

        if (all(metin in soru for metin in metinler)):

            saatlerinde = "saatlerinde"
            result2 = re.sub(r"ne zaman", saatlerinde, soru)
            # TODO: tidy this up
        if (all(metin in soru for metin in metinler)):

            saatleriarasında = "saatleri arasında"
            result2 = re.sub(r"ne zaman", saatleriarasında, soru)
        if (all(metin in soru for metin in metinler)):

            kadardevamedecek = "kadar devam edecek"
            result2 = re.sub(r"ne zaman", kadardevamedecek, soru)
        if (all(metin in soru for metin in metinler)):

            tarih = "tarih"
            result2 = re.sub(r"ne zaman", tarih, soru)
            # TODO: tidy this up

        # the phrase we may find
        #print(result1)
        #print(result2)

        #-------------------------------------------------------------------------------------------------------------------------------------------------------
        #try:
        #    liste=result1.split(tarihleriarasında)  # TODO: tidy this up
        #    stringifade1 =convert_list_to_string(liste)  # here the split list is converted back into a string
        #except:
        #    print("bu ifade bulunamıyor")
        ## stringifade1 is not used yet, but it will be later
        ## source: https://thispointer.com/python-how-to-convert-a-list-to-string/#:~:text=Convert%20list%20to%20string%20in%20python%20using%20join()%20in,a%20function%20join()%20i.e.&text=join()%20function%20accepts%20an,it%20returns%20the%20concatenated%20string.

        # later the URL part will be fetched automatically (could be handled with RPA)
        for j in search(soru, tld="co.in", num=10, stop=1, pause=2):
            url = j  # take the first URL returned for the question asked at the start
            # source: https://www.geeksforgeeks.org/performing-google-search-using-python-code/

        r = requests.get(url)

        #r= requests.get("https://www.hurriyet.com.tr/galeri-hafta-sonu-sokaga-cikma-yasagi-ne-zaman-saat-kacta-bitiyor-23-25-nisan-sokaga-cikma-yasagi-saatleri-41795118/3")
        r.content
        # source: https://docs.python-requests.org/en/master/
        soup = BeautifulSoup(r.content)
        soup.prettify(
        )  # tidy up the data fetched from the website (not strictly necessary)
        textamastringdegil = soup.find_all()  # scanned the whole page
        # source: https://www.crummy.com/software/BeautifulSoup/bs4/doc/
        text = str(textamastringdegil)  # and stored it in the text variable
        # the page source code gets written out to a txt file

        #path="C:/Users/berk/Desktop/21.05-güncel-proje/siteveri.txt"
        #dosya = open(path, 'a')
        #dosya.write(text)
        #dosya.close()

        #Old algorithm-----------------------------------------------------------------------------------------------------------------------------------------------------
        if re.search(
                tarihleriarasında, text
        ):  # source: https://stackabuse.com/using-regex-for-text-manipulation-in-python/
            print("Match found")
        else:
            print("Match not found")

        if re.search(tarihinde, text):
            print("Match found")
        else:
            print("Match not found")

        if re.search(saatlerinde, text):
            print("Match found")
        else:
            print("Match not found")

        if re.search(saatleriarasında, text):
            print("Match found")
        else:
            print("Match not found")
        if re.search(kadardevamedecek, text):
            print("Match found")
        else:
            print("Match not found")
        if re.search(tarih, text):
            print("Match found")
        else:
            print("Match not found")

        #-------------------------------------------------------------------------------------------------------------------------------------------------------
        nezamanakarsilik = [
            tarihleriarasında, tarihinde, saatlerinde, saatleriarasında,
            kadardevamedecek, tarih
        ]
        liste = []
        for x in nezamanakarsilik:
            keyword_processor.add_keyword(
                x
            )  # looks for the phrase inside the text and returns a span when it is found
            keywords_found = keyword_processor.extract_keywords(text,
                                                                span_info=True)
            # source: https://flashtext.readthedocs.io/en/latest/

            try:
                aralık = keywords_found[0]  # wrapped in try/except
                kontrol = aralık[1]

                adet = 0
                while (adet !=
                       1):  # walk left from the keyword until the 1st period
                    print("--")
                    if (text[kontrol] == "."):
                        print("ifade noktaya eşit")
                        adet += 1

                    kontrol -= 1
                    bas = kontrol

                adett = 0

                while (adett !=
                       2):  # walk right from the keyword until the 2nd period
                    print("--")
                    if (text[kontrol] == "."):
                        print("ifade noktaya eşit")
                        adett += 1

                    kontrol += 1
                    son = kontrol

                print(bas)
                print(son)
                bass = bas + 1
                sonn = son - 1
                print(text[bass:son])
                son = text[bass:son]
                liste.append(son)

                #print(text[(aralık[1]-34):aralık[2]])  # this part prints the span that could answer the "when" question
                #print(str(aralık[1]))
                #print(str(aralık[2]))
                #son=text[(aralık[1]-40):(aralık[2]+50)]
                #liste.append(son)
                ## finally, this gives the lines that could answer the question
            except IndexError:
                print(x + "-> bu kelime internet sitesinde yer almıyor")
                son = "-> bu kelime internet sitesinde yer almıyor"
        yazdır = ""
        for x in liste:
            yazdır += x
            yazdır += "-"
        print(yazdır)
        return yazdır
Example No. 17
from flashtext import KeywordProcessor

client = pymongo.MongoClient(settings.MONGO_DB_URI)
db = client[settings.MONGO_DB_NAME]
collection = db[settings.MONGO_COLLECTION_NAME]
deleteColl = db["deleteColl"]  #deleteColl
mydoc = collection.find()
kw_list = ["新冠", "疫情", "抗疫", "病例", "冠状病毒", "疾控中心", "核酸检测", "医学观察"]
filter_list = [
    "扫黑除恶", "自由贸易", "光盘行动", "消费", "六稳六保", "防汛抗旱", "税租减免", "电网", "脱贫攻坚", "进出口",
    "慰问", "复工复产", " 服贸会"
]
keyword_processor = KeywordProcessor()
filter_processor = KeywordProcessor()
for keyword in kw_list:
    keyword_processor.add_keyword(keyword)
for filter_item in filter_list:
    filter_processor.add_keyword(filter_item)
for item in mydoc:
    if item["title"] is None:
        print(item["detail_url"])
        print(item["province"])
    else:
        if isinstance(item["title"], list):
            item["title"] = item["title"][0]
            pass
        find_title = keyword_processor.extract_keywords(item["title"])
        find_content = keyword_processor.extract_keywords(item["content"])
        filter_title = filter_processor.extract_keywords(item["title"])
        filter_content = filter_processor.extract_keywords(item["content"])
        total_score = len(find_title) * 2 + len(
Example No. 18
        Returns
        -------
        Counter noun chunk -> count
        '''
        return Counter(self.keyword_processor_.extract_keywords(str(doc)))


keyword_processor = KeywordProcessor(case_sensitive=False)
for phrase in [
        'the president', 'presidents', 'presidential', 'barack obama',
        'mitt romney', 'george bush', 'george w. bush', 'bill clinton',
        'ronald regan', 'obama', 'romney', 'barack', 'mitt', 'bush', 'clinton',
        'reagan', 'mr. president', 'united states of america'
]:
    keyword_processor.add_keyword(phrase)
feature_extractor = FlashTextExtact().set_keyword_processor(keyword_processor)

convention_df = st.SampleCorpora.ConventionData2012.get_data()
convention_df['parse'] = convention_df['text'].apply(
    st.whitespace_nlp_with_sentences)
corpus = (st.CorpusFromPandas(convention_df,
                              category_col='party',
                              text_col='text',
                              nlp=st.whitespace_nlp_with_sentences,
                              feats_from_spacy_doc=feature_extractor).build())

print(corpus.get_term_freq_df())

html = scattertext.interface.ProduceScattertextExplorer.produce_scattertext_explorer(
    corpus,
Example No. 19
# functions for keywords extraction and tagging

# get the self-defined dictionary from the self-defined label table
labelen = pd.read_excel('./model/dictionary/labelen.xlsx')


def getdict():
    sdic = pd.DataFrame()
    sdic['term'] = pd.concat(
        [labelen[column].dropna() for column in labelen.columns],
        ignore_index=True)
    return sdic['term'].dropna()


for term in getdict():
    keyword_processor.add_keyword(term)

import re
import unicodedata


# extract words from the English text if they appear in the self-defined dictionary
def searchword(text):
    word = keyword_processor.extract_keywords(text)
    # phrase = phrase.lower()
    # text = text.lower()
    # word = re.search(r'\b{}\b'.format(phrase), text, re.IGNORECASE)
    return word


# Clean English text, return matched extracted words
Example No. 20
    # ----- Extract data from sources
    st = json.load(open(st_path + file))
    print('\n    * The number of attributes is equal to: '
          + str(len(st[0]['attributes'])))

    # ------ Extract data from complete rdf files
    complete = open(rdf_path + name + '/complete.nt').read()
    # Get the URIs enclosed in angle brackets <> and percent-encode them
    # TODO: should be moved into a utils cleaner
    matches = re.findall(r'\<(.*?)\>', complete)
    matches = list(
        map((lambda x: {x: urllib.parse.quote(x, safe='http://')}), matches))

    for index in range(len(matches)):
        for key in matches[index]:
            keyword_processor.add_keyword(key, matches[index][key])

    complete_updated = keyword_processor.replace_keywords(complete)

    g = rdflib.Graph()
    graph = g.parse(data=complete_updated, format='turtle')

    res = g.query(
        """
            SELECT (COUNT (DISTINCT ?s) AS ?num_entities)
            WHERE {
                ?s ?p ?o
            }
            """)

    for row in res:
Example No. 21
        documents.append(" ".join(
            [word for word in translation.split() if word not in STOP_WORDS]))
        sentences.append(translation)

keyword_processor = KeywordProcessor()
vectorizer = TfidfVectorizer()

results_kw = []
results_tfidf = []

query = normalize("کیا سود حرام ہے")
k_words = set()
for word in query.split():
    if word not in STOP_WORDS:
        k_words.add(word)
        keyword_processor.add_keyword(word)

t_matches = len(k_words)

for sentence in documents:
    keywords_found = set(keyword_processor.extract_keywords(sentence))
    n_common = len(k_words.intersection(keywords_found))
    if n_common == t_matches:
        results_kw.append(sentence)

vectors = vectorizer.fit_transform([query] + documents)

cosine_similarities = linear_kernel(vectors[0:1], vectors).flatten()
document_scores = [item.item() for item in cosine_similarities[1:]]

tf_idf = [
Example No. 22
def Delete_StandardStopwords(username,
                             prname):  # first-pass stopword removal (the stopword dictionary needs to be revised and rebuilt)
    import re, os
    from tqdm import tqdm
    from flashtext import KeywordProcessor
    from utils import Read_Arg_, Read_Sheet_, import_dataframe, export_dataframe
    kp = KeywordProcessor()
    tqdm.pandas()

    if (username == "") & (prname == ""):
        input_directory = ""  # Non-창민버전
    else:
        input_directory = "/".join([username, prname])  # Non-창민버전
    ref, input_, output_ = Read_Arg_(
        username, prname, "Delete_StandardStopwords"
    )  # Read_Arg를 통해 참조파일, input파일, output파일을 불러옵니다.
    # 이 때 ref는 "JDic_BizStopwords(경영불용어사전)"시트를,
    # input파일은 메세지 csv파일의 이름,
    # output은 처리 후 내보낼 메세지 csv파일의 이름입니다.
    Sym2Remain = Read_Sheet_(username, prname, "SymbolsDictionary")
    Clean = Read_Sheet_(username, prname, ref)  # Clean이라는 변수에 Read_Sheet를 통해
    # "JDic_BizStopwords(경영불용어사전)"시트를 불러옵니다.
    Clean.columns = Clean.iloc[0]
    Clean = Clean[1:]
    Clean["unit_length"] = Clean["word"].apply(
        lambda x: len(x))  # 이 때 하나의 이슈는 표현의 길이에 따른 나열 순서입니다.
    # (https://greeksharifa.github.io/정규표현식(re)/2018/07/22/regex-usage-03-basic/)
    # 만약 "에게"와 "에게서"를 예시로 들 떄,
    # 정규식 인자로 "에게"가 "에게서"보다 먼저 나열될 경우,
    # 메세지에서 "에게"에 대한 데이터를 먼저 찾으므로
    # 실제로 메세지에서 "에게서" 라고 표현되었던 데이터가
    # "에게"로 인해 "서"만으로 남게됩니다.
    # 따라서 위 방법을 사용하게 될 경우,
    # 불용어 사전의 칼럼으로 unit_length를 두고
    # 내림차순으로 정렬하는 것이 바람직해 보입니다.

    Sym2Remain_np = Sym2Remain.fillna("").to_numpy(dtype=list)
    all_V = list(map(lambda x: [i for i in x if i != ""],
                     Sym2Remain_np))  # all_V collects every entry from the lemma sheet.
    print(all_V)
    # all_V has the following shape:
    # [[base word a, variant a-1, variant a-2, ...],
    #  [base word b, variant b-1, variant b-2, ...],
    #  ...]
    for case in all_V:
        standardised = case[1]
        for keyword in case[0]:
            kp.add_keyword(keyword, standardised)
    print(kp.get_all_keywords())

    Clean = Clean.sort_values(
        by="unit_length",
        ascending=False)  # sort by the unit_length column in descending order and refresh.
    Clean = Clean[~Clean["word"].duplicated()]  # remove any duplicate words, just in case.

    # symbol = set(Clean.loc[Clean["class"] == "s", "word"])  # symbols are the "word" column
    # # of the Clean data frame where the "class" column is s, turned into a list.
    # symbol = str(symbol).replace("{", "").replace("}", "").replace(", ''", "").replace(", ", "|").replace("'", "")

    # print(symbol)

    characters = set(Clean.loc[Clean["class"] == "c",
                               "word"])  # characters are the "word" column
    # of the Clean data frame where the "class" column is c, turned into a list.
    characters = str(characters).replace("{", "").replace("}", "").replace(
        ", ''", "").replace(", ", "|").replace("'", "")

    # This is preparation for using regular expressions.
    # Originally, JDic_Clean takes the form of a list such as ["물론", "무엇", "무슨", …].
    # The re library, which handles regular expressions, takes a string argument,
    # so the list is converted into a string ( str(JDic_Clean) ).
    # Regex also has the .sub method: when the first argument (a particular
    # expression) is found in the third argument (the data),
    # it is replaced with the second argument.
    # Below, we look for expressions registered in the stopword dictionary inside
    # item (the data for each row of the message data) and replace them with a space.
    # Rather than replacing each registered word one by one,
    # we want to look them all up at once with an OR pattern ( | ).
    # For that, the regex argument has to take the following form:
    # "expression 1"|"expression 2"|"expression 3"|…
    # Moreover, something like "ㅜ" may appear in a message as "ㅜㅜ".
    # This is handled by adding + inside the regex:
    # + means the expression is repeated one or more times,
    # and it has to be placed right after the expression in the pattern.
    # So Clean_Candidates above ends up with the following form:
    # "expression 1 +"|"expression 2 +"|"expression 3 +"|...
    # (A short standalone illustration of this OR-pattern approach follows this example.)

    # def save_symbol(item):  # define a function called lemmatize.
    #     input = item
    #     while True:
    #         input_revised = kp.replace_keywords(input)  # replace keywords in the input sentence and store the result
    #         if input == input_revised:  # if the sentence is unchanged there is nothing left to fix, so exit the loop
    #             break
    #         else:  # if it differs, something was replaced, so keep going; input becomes the revised value
    #             input = input_revised
    #             pass
    #     return input_revised

    def save_symbol2(item):  # define a function called lemmatize (brute-force version).
        item_revised = item
        for i in range(len(Sym2Remain["decode"])):
            item_revised = item_revised.replace(
                Sym2Remain.iloc[i]["decode"],
                " " + Sym2Remain.iloc[i]["Encode"] + " ")
        return item_revised

    # def Clean_symbol(item):  # define a user function called Clean_stopwords.
    #     item_edited = re.sub(symbol, " ", item)  # whenever a pattern from Clean_candidates appears in item
    #     # (each row of input_Message), the regex replaces it with " " (a space).
    #     item_edited = " ".join(item_edited.split())  # also collapse multiple spaces.
    #     return item_edited  # return the updated, substituted data.

    # def add_space_for_symbol(item):
    #    not_words = list(filter(bool, list(set(re.compile("[^\s*\w*\s*]*").findall(item)))))
    #    for end in not_words:                                     # for every not_words element found in a message row
    #        item = item.replace(end," "+end)                                    # prepend a space via the replace method.
    #    return item

    def Clean_char(item):  # define a user function called Clean_stopwords.

        item_edited = re.sub(
            characters, " ",
            item)  # whenever a pattern from Clean_candidates appears in item
        # (each row of input_Message), the regex replaces it with " " (a space).
        item_edited = " ".join(item_edited.split())  # also collapse multiple spaces.
        return item_edited  # return the updated, substituted data.

    def Clean_leftover_symbols(item):
        item_edited = re.sub(r"[^\w\s]", "", item)
        item_edited = " ".join(item_edited.split())  # also collapse multiple spaces.
        return item_edited

    input_name = os.path.join(input_directory, input_)
    input_Message = import_dataframe(input_name)
    input_Message = input_Message[
        input_Message["contents"].notna()]  # drop any missing values (blanks) in input_Message.

    input_Message["contents"] = input_Message["contents"].progress_apply(
        save_symbol2)
    #input_Message["contents"] = input_Message["contents"].progress_apply(Clean_symbol)
    # apply Clean_stopwords via the .apply method.
    # input_Message["contents"] = input_Message["contents"].progress_apply(add_space_for_symbol)  # prepend a space before the symbols to keep.
    input_Message["contents"] = input_Message["contents"].progress_apply(
        Clean_char)
    # apply Clean_stopwords via the .apply method.
    input_Message["contents"] = input_Message["contents"].progress_apply(
        Clean_leftover_symbols)

    output_name = os.path.join(input_directory, output_)
    export_dataframe(input_Message, output_name)

    return input_Message  # return the updated data frame from Delete_Characters.
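# A small self-contained illustration of the OR-pattern idea described in the
# comments above; the stopword list here is made up.
import re

stopwords = ["에게서", "에게", "ㅜ"]  # longer forms listed first, as noted above
pattern = "|".join(word + "+" for word in stopwords)
print(re.sub(pattern, " ", "친구에게서 온 편지 ㅜㅜ"))
# both "에게서" and the repeated "ㅜㅜ" collapse to single spaces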
Example No. 23
#FlashText is a library that lets us find and replace keywords in a document.
#It uses a trie data structure for efficient information storage and retrieval.
#Both finding and replacement happen in a single pass over the text.
#It is blazing fast compared to regular expressions once the number of keywords is large.

from flashtext import KeywordProcessor
document = """Welcome to Fractal's World.Fractal Analytics is one of the leading analytics companies in India"""
processor = KeywordProcessor()
processor.add_keyword('Fractal')
found = processor.extract_keywords(document)
print(found)

#Searching for synonyms
processor.add_keywords_from_dict({'Fractal': ['Fractal', "Fractal's"]})
found = processor.extract_keywords(document)
print(found)

#Listing the location of the keywords
processor.add_keywords_from_dict({'Fractal': ['Fractal', "Fractal's"]})
found = processor.extract_keywords(document, span_info=True)
print(found)
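# The introduction above also mentions replacement; a minimal sketch of that
# side of the same API, reusing the processor and document from this example.
processor.add_keyword('Fractal Analytics', 'FA')
print(processor.replace_keywords(document))
# 'Fractal Analytics' should come back as 'FA' in the replaced text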
Example No. 24
from flashtext import KeywordProcessor

string = 'hello from Jamaica! I need grapes'

keyword_processor = KeywordProcessor()
keyword_processor.add_keyword('Jamaica', 'J Town')

new_sentence = keyword_processor.replace_keywords(string)
print(new_sentence)
# should replace Jamaica with J Town, mon.
Example No. 25
import os
# import flashtext
from flashtext import KeywordProcessor
import fileinput

kp = KeywordProcessor()

kp.add_keyword('另外', '现在')
kp.add_keyword("第二部分", "第三部分")
# print(os.path.abspath('.')
# print(os.path.abspath(os.path.dirname(__file__)))
BASE_DIR = os.path.abspath(os.path.dirname(__file__))

print(BASE_DIR)
# for i in os.listdir(BASE_DIR):
#     print(i)
#     if i == "06.txt":
# j = kp.replace_keywords(i)
i = BASE_DIR + '/06.txt'
for line in fileinput.input(i):
    j = kp.replace_keywords(line)
    print(j)
Example No. 26
if args.freq_words:
    with open(args.freq_words, 'rt', encoding="utf-8") as f:
        n = 0
        for line in f:
            n += 1
            w = line.rstrip().lower()
            freq_words[w] = n

if args.bl_words:
    with open(args.bl_words, 'rt', encoding="utf-8") as f:
        for line in f:
            if line[0] == '#':
                continue
            w = line.rstrip()
            bl_words.add_keyword(w)

if args.bl_subreddits:
    with open(args.bl_subreddits, 'rt', encoding="utf-8") as f:
        for line in f:
            if line[0] == '#':
                continue
            s = line.rstrip().lower()
            bl_subreddits[s] = 1

if args.ignore_keys:
    args.keep_keys = None
    args.discard_tgt_keys = None
else:
    if args.keep_keys:
        keys = load_keys(args.keep_keys)
#!/usr/bin/python3
# coding: utf-8

# http://blog.csdn.net/CoderPai/article/details/78574863
# https://github.com/vi3k6i5/flashtext

# keyword search
from flashtext import KeywordProcessor
keyword_processor = KeywordProcessor()
# keyword_processor.add_keyword(<alias>, <standard term>)
keyword_processor.add_keyword('广东', '广东省')
keyword_processor.add_keyword('北京')
keywords_found = keyword_processor.extract_keywords('北京与广东都是中国的')
print(keywords_found)
# ['北京', '广东省']

# keyword replacement
keyword_processor.add_keyword('A卡', '邂逅a卡')
new_sentence = keyword_processor.replace_keywords('a卡与b卡哪个卡好')
print(new_sentence)
# Out[40]: '邂逅a卡与b卡哪个卡好'


# extract keywords, case-sensitive
keyword_processor = KeywordProcessor(case_sensitive=True)
keyword_processor.add_keyword('A卡', '邂逅a卡')
keyword_processor.add_keyword('b卡')
keywords_found = keyword_processor.extract_keywords('a卡与b卡哪个卡好?')
print(keywords_found)
# ['b卡']
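# A couple of extra calls on the same case-sensitive processor, sketched for
# completeness: batch-register synonyms that normalise to one standard term,
# ask for character offsets, and remove a keyword again.
keyword_processor.add_keywords_from_dict({'北京': ['帝都']})
print(keyword_processor.extract_keywords('帝都的冬天很冷', span_info=True))
# [('北京', 0, 2)]
keyword_processor.remove_keyword('b卡')
print(keyword_processor.extract_keywords('a卡与b卡哪个卡好?'))
# [] -- 'b卡' is gone and 'A卡' does not match lowercase 'a卡' in case-sensitive mode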
Example No. 28
from flashtext import KeywordProcessor

keyword_processor = KeywordProcessor()
keyword_processor.add_keyword('New Delhi', 'NCR region')
new_sentence = keyword_processor.replace_keywords('I love nlp and new delhi.')
print(new_sentence)
import sys
import re
from flashtext import KeywordProcessor

bl_words = KeywordProcessor()
en_words = KeywordProcessor()

with open('offensive.txt', 'r') as of1:
    for line in of1:
        bl_words.add_keyword(line.strip())

with open('top50.txt', 'r') as tp50:
    for line in tp50:
        en_words.add_keyword(line.strip())

# print(bl_words)

# bad_words = bl_words.extract_keywords('shut up asshole ?')
# print(bad_words)


def filter_instance(src, tgt, info='info'):
    # Remove offensive words:
    # we do not have the gold list of offensive words
    if bl_words:
        bad_words = bl_words.extract_keywords(tgt)
        if bad_words:
            print("skip\toffensive\t%s\t%s\tbad word(s): %s" %
                  (info, tgt, bad_words),
                  file=sys.stderr)
            return True
Example No. 30
def make_ne_founder(db, case):
    keyword_processor = KeywordProcessor(case_sensitive=case)
    for vocab in tqdm(db.vocab_list):
        keyword_processor.add_keyword(vocab)
    db.setting(keyword_processor)
Example No. 31
class AddressExtractor:
    """Text processor to extract French addresses based on city name and postal code.

    The main entry point is the `extract_addresses()` method. An `OCRResult` is
    searched for addresses in the following way:

    * The text is prepared by taking it lower case, removing accents, and replacing
      the characters ' and - with " " (space), as city names must follow this format.
    * City names are searched for in the text.
    * For each city name found, its corresponding postal code is searched for in the
      surrounding text, at a maximum distance of `postal_code_search_distance`.
    * If the postal code is found, the match is added to the list of returned
      addresses, along with an extract of the text surrounding the address,
      at a maximum distance of `text_extract_distance`.

    Args:
        cities (iterable of City): Set of cities to search for.
        postal_code_search_distance (int, optional, default 10): Maximum distance
            from a city name to search for a postal code.
        text_extract_distance (int, optional, default 30): Amount of text surrounding a
            detected address to extract for returning.
    """
    def __init__(
        self,
        cities: Iterable[City],
        postal_code_search_distance: int = 10,
        text_extract_distance: int = 30,
    ):
        self.cities = cities
        self.postal_code_search_distance = postal_code_search_distance
        self.text_extract_distance = text_extract_distance

        self.cities_processor = KeywordProcessor()
        for city in self.cities:
            self.cities_processor.add_keyword(city.name, city)

    def extract_addresses(self, content: Union[str,
                                               OCRResult]) -> List[Prediction]:
        """Extract addresses from the given OCR result.

        Args:
            content (OCRResult or str): a string or the OCR result to process.

        Returns:
            list of Prediction: List of addresses extracted from the text. Each entry
            is a dictionary with the items: country_code (always "fr"), city_name,
            postal_code and text_extract.
        """
        if isinstance(content, OCRResult):
            text = self.get_text(content)
        else:
            text = content

        text = self.normalize_text(text)
        city_matches = self.find_city_names(text)

        locations = []
        for city, city_start, city_end in city_matches:
            pc_match = self.find_nearby_postal_code(text, city, city_start,
                                                    city_end)
            if pc_match is None:
                continue

            pc, pc_start, pc_end = pc_match
            address_start = min(city_start,
                                pc_start) - self.text_extract_distance
            address_end = max(city_end, pc_end) + self.text_extract_distance
            text_extract = text[max(0, address_start
                                    ):min(len(text), address_end)]

            locations.append(
                Prediction(
                    type=PredictionType.location,
                    data={
                        "country_code": "fr",
                        "city_name": city.name,
                        "postal_code": city.postal_code,
                        "text_extract": text_extract,
                    },
                ))

        return locations

    @staticmethod
    def get_text(ocr_result: OCRResult) -> str:
        """Extract text from the OCR result and prepare it.

        Args:
            ocr_result (OCRResult): The OCR result to process.

        Returns:
            str: The text extracted and prepared.
        """
        text = ocr_result.get_full_text()
        if text is None:
            # Using `OCRResult.text_annotations` directly instead of
            # `OCRResult.get_text_annotations()` because the latter contains
            # the text duplicated
            text = ocr_result.text_annotations[0].text
        return text

    @staticmethod
    def normalize_text(text: str) -> str:
        text = text.lower()
        text = strip_accents_ascii(text)
        return text.replace("'", " ").replace("-", " ")

    def find_city_names(self, text: str) -> List[Tuple[City, int, int]]:
        """Find all cities from the search set in the text.

        Args:
            text (str): Text to search city names in.

        Returns:
            list of (City, int, int): The list of `City`s which name was found in the
            text, with the start and end indices of their names locations in the
            text. Empty list if none found.
        """
        return self.cities_processor.extract_keywords(text, span_info=True)

    def find_nearby_postal_code(
            self, text: str, city: City, city_start: int,
            city_end: int) -> Optional[Tuple[str, int, int]]:
        """Search for a city's postal code close to its name in the text.

        The postal code is searched at a maximum distance of
        `postal_code_search_distance` from the city name.

        Assumes digit-only postal code, allows non-digit directly next to it. For
        example, for the city "paris" with postal code "75000", "75000 paris" and
        "fr75000 paris" will match.

        Args:
            text (str): The OCR result text.
            city (City): The `City` for which to search the postal code.
            city_start (int): Start index of the city name match in `text`.
            city_end (int): End index of the city name match in `text`.

        Returns:
            (str, int, int) or None: If the `City`'s postal code was found close to
            the city name match, it is returned along with its start and end indices
            in the text. If it was not found, returns None.
        """
        if not city.postal_code.isdigit():
            logger = get_logger("{}.{}".format(self.__module__,
                                               self.__class__.__name__))
            logger.error("postal code contains non-digit characters: %s", city)
            return None
        pattern = r"(?:[^0-9]|^)({})(?:[^0-9]|$)".format(city.postal_code)

        sub_start = max(0, city_start - self.postal_code_search_distance)
        sub_end = min(len(text), city_end + self.postal_code_search_distance)
        sub_text = text[sub_start:sub_end]

        match = re.search(pattern, sub_text)
        if match is None:
            return None
        else:
            return match.group(
                1), sub_start + match.start(1), sub_start + match.end(1)
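# A usage sketch for the extractor above. The real City type comes from the
# surrounding project; a tiny stand-in with the two fields the extractor
# actually reads (name and postal_code) keeps the snippet self-contained.
from dataclasses import dataclass


@dataclass(frozen=True)
class DemoCity:
    name: str
    postal_code: str


extractor = AddressExtractor(cities=[DemoCity("paris", "75000")])
for prediction in extractor.extract_addresses("Fabriqué à FR75000 Paris pour Example SA"):
    print(prediction)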