Example #1
    # Uses readability-lxml's Document and BeautifulSoup (bs4); `re` must be imported.
    def get_main_text(self):
        # Let readability favor nodes whose class/id matches the event-page markup.
        doc = Document(
            self._page.content,
            positive_keywords=re.compile(
                'event-description__text|event-heading__title|event-heading__argument',
                re.I))
        title = doc.title()
        summary = doc.summary(html_partial=True)

        self.summary_bs = BeautifulSoup(summary, 'html.parser')

        # Keep only visible strings that carry no leftover markup characters.
        strings = []
        for div in self.summary_bs.find_all(['div', 'span', 'body']):
            strings.extend([string for string in div.stripped_strings if
                            string != "" and re.search(r'[<>{}=\[\]\|]', string) is None])
        text = "\n".join(strings)
        preprocessed_text = TextUtils.handle(text)
        return '{}\n{}'.format(' '.join(TextUtils.handle(title)), ' '.join(preprocessed_text))
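
This method builds on readability-lxml's Document and on BeautifulSoup. Below is a minimal standalone sketch of the same extraction pipeline; the HTML string is invented toy input, not the crawler's real page content:

import re

from bs4 import BeautifulSoup
from readability import Document  # pip install readability-lxml

html = ('<html><head><title>Jazz Night</title></head><body>'
        '<div class="event-description__text"><p>Live jazz, doors at 19:00.</p></div>'
        '<div class="sidebar">{{ad_slot}}</div></body></html>')

doc = Document(html, positive_keywords=re.compile('event-description__text', re.I))
summary = BeautifulSoup(doc.summary(html_partial=True), 'html.parser')

# Same filter as above: keep visible strings with no leftover markup characters.
strings = [s for div in summary.find_all(['div', 'span', 'body'])
           for s in div.stripped_strings
           if re.search(r'[<>{}=\[\]\|]', s) is None]
print(doc.title())         # Jazz Night
print('\n'.join(strings))  # the readable event text; '{{ad_slot}}' is filtered out

The positive_keywords regex nudges readability's content scoring toward the event markup, and the character filter then drops any string that still looks like templating or tags.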
Example #2
    def get_text(self):
        # Guard against pages whose HTML never parsed into a soup.
        if self.main_soup is None:
            return ""
        strings = []

        # Same visible-string filter as in get_main_text above.
        for div in self.main_soup.find_all(['div', 'span', 'body']):
            strings.extend([
                string for string in div.stripped_strings
                if string != "" and re.search(r'[<>{}=\[\]\|]', string) is None
            ])
        text = "\n".join(strings)
        preprocessed_text = TextUtils.handle(text)
        return ' '.join(preprocessed_text)
Example #3
    def __init__(self, frontier, dir_to_save, dir_checkpoints,
                 checkpoints_name, lock, inv_index, file_description):
        self.dir_checkpoints = dir_checkpoints
        self.frontier = frontier
        self.dir_to_save = dir_to_save
        self.documents = {}
        self.file_description = file_description
        self.checkpoints_name = checkpoints_name
        self.steps_count = 0
        self.inv_index = inv_index
        self.entities_wrapper = EntityTableWrapper()
        self.fileattribute_wrapper = FileAttributeTableWrapper()
        self.lock = lock
        # Pre-tokenize the entertainment vocabulary once, at construction time.
        self.entertainment_words = TextUtils.handle(' '.join(
            get_entertainments_words()))
Example #4
    def __init__(self,
                 url,
                 text,
                 type=None,
                 time=None,
                 date=None,
                 price=None,
                 city=None,
                 venue=None,
                 name=None):
        self.url = url
        self.time = time
        self.date = date
        # Date/time fallback: fuzzy-parse the strings in `text` and take the
        # first one dateutil manages to extract a date from.
        if self.time is None or self.date is None:
            for s in text:
                try:
                    d = dparser.parse(s, fuzzy=True)
                    if self.time is None:
                        self.time = str(d.time())
                    if self.date is None:
                        self.date = str(d.date())
                    break
                except (ValueError, OverflowError, TypeError):
                    # dateutil could not extract a date from this string.
                    continue

        self.type = type
        self.price = price
        self.venue = venue

        # City fallback: pick the GeoText-detected city mentioned most often.
        self.city = city
        if city is None:
            cities = GeoText(text).cities
            max_cnt = -1
            max_city = None
            words = TextUtils.text_to_words(text)
            for candidate in cities:
                cnt = words.count(candidate)
                if cnt > max_cnt:
                    max_cnt = cnt
                    max_city = candidate
            self.city = max_city

        # Name fallback: the last URL path segment, query string stripped.
        self.name = name
        if self.name is None:
            pieces = url.split('/')
            self.name = pieces[-1].split('?', 1)[0]
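
The date and city fallbacks above rely on dateutil's fuzzy parsing and on the geotext package. A small standalone sketch of both calls, using an invented toy string (dparser is the dateutil.parser alias used in the snippet):

import dateutil.parser as dparser
from geotext import GeoText  # pip install geotext

s = "Concert in Berlin on June 5, 2021 at 20:00"
d = dparser.parse(s, fuzzy=True)  # fuzzy=True skips the non-date words
print(d.date(), d.time())         # 2021-06-05 20:00:00
print(GeoText(s).cities)          # ['Berlin']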
Example #5
    def get_text(self):
        text = self.get_row_text()
        preprocessed_text = TextUtils.handle(text)
        return ' '.join(preprocessed_text)
Example #6
def process(query, city, lock, checkpoint_path, descr_file):
    # TODO: check & debug

    # Load the latest crawler checkpoint under the lock; the inverted index is
    # the fourth element of the pickled tuple.
    with lock:
        try:
            with open(checkpoint_path, 'rb') as check_file:
                crawler_loaded = pickle.load(check_file)
                _, _, _, inv_index = crawler_loaded
        except Exception as err:
            print(err)
            return '0\nPlease, wait until we gather some content. Try again later.'

    preprocessed_query = TextUtils.handle(query)
    query_info = {}
    for word in preprocessed_query:
        # The index is shared with the crawler process, so read it under the lock.
        with lock:
            word_doc_info_list = inv_index.get_index(word)
        query_info[word] = word_doc_info_list

    tf = gather_info(query_info)  # tf[doc][word]
    urls = get_id_url(descr_file)
    cnt_docs = len(urls)
    # idf[word]; query words with empty posting lists are skipped here.
    idf = {word: math.log(1.0 * cnt_docs / len(query_info[word]))
           for word in query_info if len(query_info[word]) != 0}
    docs = tf  # same mapping as tf; reuse it instead of recomputing

    score = {}  # BM25

    paf = FileAttributeTableWrapper()

    k1 = 1.5  # TODO: const to config
    b = 0.75
    dl_av = paf.get_average()  # average document length; fetch once, not per doc

    for doc in docs:
        score[doc] = 0
        url = urls[doc]
        # Skip alternate views of the same page (calendar, mobile, menu links).
        if '&date=' in url or '&version=mobile' in url or '?source=menu' in url:
            continue
        _, _, doc_dl, _, _ = paf.get_row(doc)
        for word in preprocessed_query:
            if word not in idf:
                continue  # the word occurs in no indexed document
            tf_doc_word = tf[doc].get(word, 0)
            score[doc] += (1.0 * idf[word] * (k1 + 1) * tf_doc_word
                           / (k1 * ((1 - b) + 1.0 * b * doc_dl / dl_av) + tf_doc_word))

    if len(score) == 0:
        print("Can't find anything for query {}".format(query))

    ranking_docs_without_city = sorted(score, key=score.get, reverse=True)
    # "Выберите город" means "Choose a city", i.e. no city filter was selected.
    if city != 'Выберите город':
        # `table` and `eng` (city-name translations) are presumably defined at
        # module level in the original source; they do not appear in this snippet.
        ranking_docs = []
        for doc in ranking_docs_without_city:
            doc_city = table.get_row(doc)[6]
            if doc_city is not None and doc_city.lower() == eng[city]:
                ranking_docs.append(doc)
    else:
        ranking_docs = ranking_docs_without_city

    best_urls = []
    for doc in ranking_docs[:15]:
        url = urls[doc]
        result = url + "\t"

        # Build the snippet: re-fetch the page and highlight each query word
        # inside a few words of surrounding context.
        page = Page(url)
        page.retrieve()
        row_text = TextUtils.text_to_words(page.get_row_text())
        only_words = TextUtils.only_words(row_text)
        preprocessed_text = TextUtils.stem(only_words)
        for word in preprocessed_query:
            context, index = TextUtils.search(word, only_words, preprocessed_text, 4)
            if context is None:
                continue
            for ind, w in enumerate(context):
                if ind != index:
                    result += w + " "
                else:
                    result += "<span style=\"color: #00a93b;font-weight:bold\">" + w + "</span>"
            result += "\n"

        print(result)
        best_urls.append(result)

    if len(best_urls) == 0:
        return '2\n\n' + "Can't find result on this query."
    return '1\n\n' + config.get_map_access_token() + '\n\n' + '\n'.join(best_urls)
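
The inner loop in process implements the standard BM25 formula with k1 = 1.5 and b = 0.75. Here is a self-contained toy version of that scoring, with an invented three-document corpus standing in for the crawler's inverted index:

import math

corpus = {
    1: "cheap concert tickets berlin",
    2: "concert in the park tonight",
    3: "park opening hours",
}
docs = {doc_id: text.split() for doc_id, text in corpus.items()}
dl_av = sum(len(words) for words in docs.values()) / len(docs)
k1, b = 1.5, 0.75

def bm25(query_words):
    scores = {}
    for doc_id, words in docs.items():
        s = 0.0
        for word in query_words:
            df = sum(1 for ws in docs.values() if word in ws)
            if df == 0:
                continue  # word occurs nowhere, contributes nothing
            idf = math.log(1.0 * len(docs) / df)
            tf = words.count(word)
            s += idf * (k1 + 1) * tf / (k1 * ((1 - b) + b * len(words) / dl_av) + tf)
        scores[doc_id] = s
    # Highest score first, mirroring ranking_docs_without_city above.
    return sorted(scores, key=scores.get, reverse=True)

print(bm25("concert park".split()))  # document 2 mentions both words and ranks first

The (1 - b) term keeps part of the score length-independent, while b * len(words) / dl_av penalizes documents longer than average.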