def show_pk_article(request, pk, keyword):
    a = Article.objects.get(pk=pk)
    tokens = string_to_tokens(a.abstract)
    keyword_stem = string_to_tokens(keyword)
    # collect every position where the stemmed keyword occurs in the abstract
    pos = []
    for w in tokens:
        if keyword_stem[0][0] == w[0]:
            pos.append(w[1])
    # newlines in the abstract break the template, so replace them with spaces
    _abstract = a.abstract.replace('\n', ' ')
    render_dict = {
        'title': a.title,
        'abstract': _abstract.split(' '),
        'keyword': keyword,
        'pos': pos
    }
    return render(request, 'search_engine/show_an_article.html', render_dict)
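# For reference, a minimal sketch of the URLconf entry this view assumes;
# the route pattern and name are assumptions, not taken from the project:
#
#     from django.urls import path
#     from . import views
#
#     urlpatterns = [
#         path('article/<int:pk>/<str:keyword>/', views.show_pk_article,
#              name='show_pk_article'),
#     ]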
def create_revindex(request):
    articles = Article.objects.all()
    # build the reverse index: one Word row per (stem, position) pair,
    # linked back to its source article through the many-to-many field
    for article in articles:
        for token in string_to_tokens(article.abstract):
            if token[0] != 'nan':
                w = Word(context=token[0], pos_in_a_article=token[1])
                w.save()
                w.position.add(article)
    return HttpResponse('Reverse index created')
def create_stem_freq(request):
    articles = Article.objects.all()
    word_freq = {}
    for article in articles:
        for token in string_to_tokens(article.abstract):
            if token[0] != 'nan':
                if token[0] in word_freq:
                    word_freq[token[0]] += 1
                else:
                    # the first occurrence counts as 1
                    word_freq[token[0]] = 1
    # sort the dict by frequency, descending
    word_freq = {
        k: v
        for k, v in sorted(
            word_freq.items(), key=lambda item: item[1], reverse=True)
    }
    # save to database
    for word, freq in word_freq.items():
        StemFreq(word=word, frequency=freq).save()
    return HttpResponse('StemFreq created')
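# A minimal sketch of the same count using collections.Counter, assuming
# string_to_tokens yields (stem, position) pairs as above; the helper name
# is hypothetical and nothing in these views calls it.
def _stem_frequencies(articles):
    from collections import Counter
    counts = Counter(
        token[0]
        for article in articles
        for token in string_to_tokens(article.abstract)
        if token[0] != 'nan')
    # most_common() already yields (word, freq) pairs sorted by frequency
    return dict(counts.most_common())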
def get_subset(keyword):
    from search_engine.parsing_utils import string_to_tokens
    corrected_keyword = keyword
    # query for the data subset
    keywords_cleaned = string_to_tokens(corrected_keyword)
    articles_pk = []
    # get the target articles' pks
    for token in keywords_cleaned:
        for w in Word.objects.filter(context=token[0]):
            article = w.position.get()
            if article.pk not in articles_pk:
                articles_pk.append(article.pk)
    words_freq = {}
    words = []
    # get the target articles' words
    for pk in articles_pk:
        for w in Word.objects.filter(position__id=pk):
            if w.context not in words_freq:
                words_freq[w.context] = 1
            else:
                words_freq[w.context] += 1
            words.append(w.context)
    # sort the dict by frequency, descending
    words_freq = {
        k: v
        for k, v in sorted(
            words_freq.items(), key=lambda item: item[1], reverse=True)
    }
    return words_freq, words
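# Usage sketch for get_subset (the keyword is hypothetical):
#
#     words_freq, words = get_subset('vaccine')
#     top10 = list(words_freq.items())[:10]  # ten most frequent stems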
def stemmed(keyword):
    from search_engine.parsing_utils import string_to_tokens
    keywords_cleaned = string_to_tokens(keyword)
    # return the stem of the first token; None if nothing was tokenized
    if keywords_cleaned:
        return keywords_cleaned[0][0]
    return None
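# Example (the output is an assumption, based on the 'vaccin' stem used in
# zipf_ct below, which suggests a Porter-style stemmer):
#
#     stemmed('vaccines')  # -> 'vaccin'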
def show_articles(request, first=False):
    # POST => process the form data from the user
    if not first and request.method == 'POST':
        form = WordForm(request.POST)
        if form.is_valid():
            # parse and clean (stemming, etc.) the keywords
            keywords_cleaned = string_to_tokens(form.cleaned_data['keywords'])
            # the doc index of an article is (pk - pk of the first article)
            article_firstpk = Article.objects.all()[0].pk
            # search the keywords in the db (Model => Word)
            all_words = []
            for token in keywords_cleaned:
                # retrieve every Word whose stem matches
                for w in Word.objects.filter(context=token[0]):
                    temp = {}
                    temp['word'] = w.context  # keyword name
                    temp['pos'] = w.pos_in_a_article  # position in the doc
                    # look up which doc it is in via the many-to-many table
                    q_art = w.position.get()
                    temp['docs'] = q_art.pk - article_firstpk
                    all_words.append(temp)

            # now we render the show page
            all_articles = Article.objects.all()
            len_article = len(all_articles)  # number of articles
            # newlines in the abstracts break the template, so replace them
            arts = [a.abstract.replace('\n', ' ').split(' ')
                    for a in all_articles]  # articles broken into words

            # count sentences
            tot_sc = 0
            sep_sc = []
            for a in all_articles:
                tot_sc += count_sent(a.abstract)
                sep_sc.append(count_sent(a.abstract))

            # count words
            tot_words = 0
            len_sep_words = []
            for art_words in arts:
                tot_words += len(art_words)
                len_sep_words.append(len(art_words))

            # which docs have keywords
            # format: [doc1, doc2, doc3] (type int)
            key_docs = []
            for w in all_words:
                if w['docs'] not in key_docs:
                    key_docs.append(w['docs'])

            # where the keywords are in the corresponding doc
            # format: {doc1: [pos1, pos2], doc2: [pos3, pos4]} (type int)
            key_pos = {}
            for w in all_words:
                if w['docs'] not in key_pos:
                    key_pos[w['docs']] = [w['pos']]
                else:
                    key_pos[w['docs']].append(w['pos'])

            form = WordForm()
            # structure of the context:
            #   'articles' : a list of articles to show on the template
            #   'keylines' : a list of which lines have words to highlight
            #   'keywords' : a dict mapping each line (int) to the list of
            #                word positions to highlight in it
            words = {
                'form': form,
                'len_article': len_article,
                'tot_sc': tot_sc,
                'sep_sc': sep_sc,
                'len_words': tot_words,
                'sep_words': len_sep_words,
                'articles': arts,
                'num_result': len(all_words),
                'keylines': key_docs,
                'keywords': key_pos
            }
            return render(request, 'search_engine/show_articles.html', words)

    # GET (or an invalid form) => show all articles with a blank form
    form = WordForm()
    all_articles = Article.objects.all()
    len_article = len(all_articles)  # number of articles
    arts = [a.abstract.split(' ') for a in all_articles]  # articles broken into words

    # count sentences
    tot_sc = 0
    sep_sc = []
    for a in all_articles:
        tot_sc += count_sent(a.abstract)
        sep_sc.append(count_sent(a.abstract))

    # count words
    tot_words = 0
    len_sep_words = []
    for art_words in arts:
        tot_words += len(art_words)
        len_sep_words.append(len(art_words))

    words = {
        'form': form,
        'articles': arts,
        'tot_sc': tot_sc,
        'sep_sc': sep_sc,
        'len_article': len_article,
        'len_words': tot_words,
        'sep_words': len_sep_words
    }
    return render(request, 'search_engine/show_articles.html', words)
def get_keywords(request):
    # POST => process the form data from the user
    if request.method == 'POST':
        form = WordForm(request.POST)
        if form.is_valid():
            # parse and clean (stemming, etc.) the keywords
            keywords_cleaned = string_to_tokens(form.cleaned_data['keywords'])
            # the doc index of an article is (pk - pk of the first article)
            article_firstpk = Article.objects.all()[0].pk
            # search the keywords in the db (Model => Word)
            all_words = []
            for token in keywords_cleaned:
                # retrieve every Word whose stem matches
                for w in Word.objects.filter(context=token[0]):
                    temp = {}
                    temp['word'] = w.context  # keyword name
                    temp['pos'] = w.pos_in_a_article  # position in the doc
                    # look up which doc it is in via the many-to-many table
                    q_art = w.position.get()
                    temp['docs'] = q_art.pk - article_firstpk
                    all_words.append(temp)

            # now we render the show page
            all_articles = Article.objects.all()

            # which docs have keywords
            # format: [doc1, doc2, doc3] (type int)
            key_docs = []
            for w in all_words:
                if w['docs'] not in key_docs:
                    key_docs.append(w['docs'])

            # where the keywords are in the corresponding doc
            # format: {doc1: [pos1, pos2], doc2: [pos3, pos4]} (type int)
            key_pos = {}
            for w in all_words:
                if w['docs'] not in key_pos:
                    key_pos[w['docs']] = [w['pos']]
                else:
                    key_pos[w['docs']].append(w['pos'])

            words = {
                'articles': [a.abstract.split(' ') for a in all_articles],
                'num_result': len(all_words),
                'keylines': key_docs,
                'keywords': key_pos
            }
            return HttpResponse(
                loader.get_template('search_engine/show_articles.html').render(
                    words, request))

    # GET (or an invalid form) => create a blank form
    form = WordForm()
    return render(request, 'search_engine/search_page.html', {'form': form})
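# The (pk - first_pk) doc-index trick above assumes article pks are
# contiguous and ascending. A sketch of the same lookup without that
# assumption (the helper name is hypothetical, nothing here calls it):
def _doc_index(article):
    pks = list(
        Article.objects.order_by('pk').values_list('pk', flat=True))
    return pks.index(article.pk)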
def zipf_ct(request):
    keyword = 'ct'
    corrected_keywords = [
        'mask', 'children', '2003', '2019', 'pneumothorax', 'pneumonia',
        'image', 'adult', 'bat', 'immune', 'wuhan', 'vaccin'
    ]
    return_dict = {}
    # build one data subset per predefined keyword
    for ck in corrected_keywords:
        keywords_cleaned = string_to_tokens(ck)
        articles_pk = []
        # get the target articles' pks
        for w in Word.objects.filter(context=keywords_cleaned[0][0]):
            article = w.position.get()
            if article.pk not in articles_pk:
                articles_pk.append(article.pk)
        words = {}
        # get the target articles' words
        for pk in articles_pk:
            for w in Word.objects.filter(position__id=pk):
                if w.context not in words:
                    words[w.context] = 1
                else:
                    words[w.context] += 1
        # sort the dict by frequency, descending
        words = {
            k: v
            for k, v in sorted(
                words.items(), key=lambda item: item[1], reverse=True)
        }
        _words = list(words.keys())
        _freq = list(words.values())
        # rank of the keyword 'ct' inside this subset (0 if not found)
        keyword_rank = 0
        for rank, stem in enumerate(_words):
            if stem == keyword:
                keyword_rank = rank
        return_dict['{}_words'.format(ck)] = _words
        return_dict['{}_freq'.format(ck)] = _freq
        # note: despite its name, this key stores the number of distinct stems
        return_dict['{}_subset_article_num'.format(ck)] = len(_words)
        return_dict['{}_rank'.format(ck)] = keyword_rank
    return render(request, 'search_engine/chart_ct.html', return_dict)
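# A minimal sketch of the Zipf's-law reference curve these charts are named
# after: the word at rank r is expected to occur roughly f(1) / r times.
# The helper name is an assumption; the templates may compute this client-side.
def _zipf_expected(freqs):
    # freqs must already be sorted in descending order
    if not freqs:
        return []
    return [freqs[0] / rank for rank in range(1, len(freqs) + 1)]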
def zipf_search(request, subset=0):
    # for spelling correction
    from spellchecker import SpellChecker
    spell = SpellChecker()
    form = WordForm()
    corrected_keyword = ''
    search_title = ''
    # user search
    if request.method == 'POST':
        form = WordForm(request.POST)
        # get the search word
        if form.is_valid():
            # parse and clean (stemming, etc.) the keywords
            origin_keyword = form.cleaned_data['keywords']
            misspelled = spell.unknown([origin_keyword])
            if misspelled:
                for w in misspelled:
                    corrected_keyword = spell.correction(w)
                    search_title = ('Did you mean "{0}"? '
                                    'Showing search results for {0}'.format(
                                        corrected_keyword))
            else:
                corrected_keyword = origin_keyword
                search_title = 'Search results for {}'.format(origin_keyword)
    # predefined subsets
    elif subset == 0:
        corrected_keyword = 'mask'
        search_title = 'Search results for mask'
    elif subset == 1:
        corrected_keyword = '2005'
        search_title = 'Search results for 2005'
    elif subset == 2:
        corrected_keyword = 'children'
        search_title = 'Search results for children'

    # query for the data subset
    keywords_cleaned = string_to_tokens(corrected_keyword)
    titles = {}  # target articles' titles, counted once per keyword hit
    articles_pk = []  # target articles' pks
    articles_raw_words = {}  # raw (unstemmed) words of the target abstracts
    for token in keywords_cleaned:
        for w in Word.objects.filter(context=token[0]):
            article = w.position.get()
            title = article.title
            # append the article pk
            if article.pk not in articles_pk:
                articles_pk.append(article.pk)
            # count the raw abstract words
            for raw in article.abstract.split(' '):
                if raw not in articles_raw_words:
                    articles_raw_words[raw] = 1
                else:
                    articles_raw_words[raw] += 1
            # count the article title
            if title not in titles:
                titles[title] = 1
            else:
                titles[title] += 1
    # sort both dicts by frequency, descending
    titles = {
        k: v
        for k, v in sorted(
            titles.items(), key=lambda item: item[1], reverse=True)
    }
    articles_raw_words = {
        k: v
        for k, v in sorted(
            articles_raw_words.items(), key=lambda item: item[1], reverse=True)
    }
    titles_name = list(titles.keys())
    titles_freq = list(titles.values())
    # rebuild articles_pk in sorted-title order
    articles_pk = []
    for name in titles_name:
        articles_pk.append(Article.objects.filter(title=name)[0].pk)

    words = {}
    # get the target articles' words
    for pk in articles_pk:
        for w in Word.objects.filter(position__id=pk):
            if w.context not in words:
                words[w.context] = 1
            else:
                words[w.context] += 1
    # sort the dict by frequency, descending
    words = {
        k: v
        for k, v in sorted(
            words.items(), key=lambda item: item[1], reverse=True)
    }

    chart_title = str(corrected_keyword)
    # stemmed words
    top100_words = list(words.keys())[:100]
    freq = list(words.values())
    # rank of each top stem in the whole stemmed COVID-19 set
    # (StemFreq rows were saved in descending frequency order, so
    # pk - first_pk is the rank)
    stem_covid_set_freq = []
    stem_covid_set_rank = []
    stem_covid_set_first_pk = StemFreq.objects.all()[0].pk
    for stem in top100_words:
        sf = StemFreq.objects.get(word=stem)
        stem_covid_set_freq.append(sf.frequency)
        stem_covid_set_rank.append(sf.pk - stem_covid_set_first_pk)
    # original (unstemmed) words
    top100_words_raw = list(articles_raw_words.keys())[:100]
    freq_raw = list(articles_raw_words.values())
    # rank of each top raw word in the whole original COVID-19 set
    origin_covid_set_freq = []
    origin_covid_set_rank = []
    origin_covid_set_first_pk = OriginFreq.objects.all()[0].pk
    for raw in top100_words_raw:
        of = OriginFreq.objects.get(word=raw)
        origin_covid_set_freq.append(of.frequency)
        origin_covid_set_rank.append(of.pk - origin_covid_set_first_pk)

    return_dict = {
        'search_title': search_title,
        'titles_name': titles_name,
        'titles_freq': titles_freq,
        'chart_title': chart_title,
        'form': form,
        'top_words': top100_words,
        'freq': freq,
        'top_words_raw': top100_words_raw,
        'freq_raw': freq_raw,
        'stem_covid_set_freq': stem_covid_set_freq,
        'stem_covid_set_rank': stem_covid_set_rank,
        'origin_covid_set_freq': origin_covid_set_freq,
        'origin_covid_set_rank': origin_covid_set_rank,
        'article_pk': articles_pk
    }
    return render(request, 'search_engine/chart_search.html', return_dict)