Example #1
def get_neg_sentence(token, text):
    # Returns the sentence in which the token appears, but only if that
    # sentence's sentiment is negative both before and after processing.
    sentences = text.split('.')
    for sentence in sentences:
        words = sentence.split()
        for word in words:
            if (process_text(word) == token
                    and get_sentiment(process_text(sentence)) < 0
                    and get_sentiment(sentence) < 0):
                return sentence
    return None
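`process_text` and `get_sentiment` are project helpers not shown here. A minimal illustrative sketch, assuming `process_text` lowercases and strips punctuation and `get_sentiment` scores against a tiny lexicon (both implementations and the word lists are assumptions, not the project's code):

import string

POS_WORDS = {"good", "great", "excellent"}   # assumed toy lexicon
NEG_WORDS = {"bad", "awful", "terrible"}

def process_text(text):
    # Hypothetical normalizer: lowercase and strip surrounding punctuation.
    return text.lower().strip(string.punctuation + " ")

def get_sentiment(text):
    # Hypothetical score: positive hits minus negative hits.
    words = [w.strip(string.punctuation) for w in text.lower().split()]
    return sum(w in POS_WORDS for w in words) - sum(w in NEG_WORDS for w in words)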
Example #2
import os


def run_basic_search(query):
    results = {}  # file_index -> [frequency, file name, match positions]
    file_index = 0
    directory = '../input'
    for file in os.listdir(directory):
        if file.endswith("html"):
            with open(os.path.join(directory, file), "r", encoding='utf8', errors='ignore') as f:
                html = f.read()
                words = process_text(html)
                frequency = 0
                indexes = []
                for word_index, word in enumerate(words):
                    word = process_word(word)
                    if word != "" and word in query:
                        frequency += 1
                        indexes.append(word_index)

            if frequency > 0:
                results[file_index] = [frequency, file, indexes]

            file_index += 1

    # Rank files by match frequency, highest first.
    sorted_results = sorted(results.values(), key=lambda kv: kv[0], reverse=True)
    get_results(sorted_results)
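A call site for this search might look as follows, reusing the same `process_word` normalizer for the raw query string (this driver is an illustrative assumption, not part of the project):

# Hypothetical driver: normalize the raw query the same way the documents
# are normalized, then run the search over ../input.
raw_query = "inverted index search"
query = [t for t in (process_word(w) for w in raw_query.split()) if t != ""]
run_basic_search(query)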
Example #3
def get_results(sorted_res):
    # Prints frequency, file name and a text snippet for each ranked result.
    for value in sorted_res:
        path = '../input/' + value[1]

        with open(path, "r", encoding='utf8', errors='ignore') as f:
            html = f.read()
            word_list = process_text(html)
            snippet = print_snippet(word_list, value[2])
            print("%d\t\t\t %s\t\t\t\t\t%s" % (value[0], value[1], snippet))
Example #4
File: gui.py Project: Khady/ocr
    def refresh(self):
        if len(self.files) == 0:
            return

        cv_image = cv2.cvtColor(preprocess.process_text(self.files[self.i]),
                                cv2.COLOR_GRAY2RGB)
        pil_image = Image.fromarray(cv_image)
        pil_image = pil_image.resize((300, 300), PIL.Image.ANTIALIAS)
        pil_image.save("/tmp/i.gif")

        # Keep a reference on self so Tkinter does not garbage-collect the image.
        self.gif1 = Tkinter.PhotoImage(file="/tmp/i.gif")
        self.canvas.create_image(50, 10, image=self.gif1, anchor=Tkinter.NW)
        self.root.refresh()
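This example targets Python 2 and an old Pillow: `Tkinter` became `tkinter` in Python 3, and `PIL.Image.ANTIALIAS` was removed in Pillow 10 in favor of `Image.Resampling.LANCZOS`. A minimal sketch of the version-sensitive lines on a current stack (the placeholder image stands in for the OCR output):

import tkinter
from PIL import Image

root = tkinter.Tk()                       # PhotoImage needs a Tk root
pil_image = Image.new("RGB", (600, 600))  # stand-in for the processed frame
pil_image = pil_image.resize((300, 300), Image.Resampling.LANCZOS)
pil_image.save("/tmp/i.gif")
gif1 = tkinter.PhotoImage(file="/tmp/i.gif")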
Example #5
import sqlite3


def run_sqlite_search(query):
    conn = sqlite3.connect('inverted-index.db')
    c = conn.cursor()
    # One "?" placeholder per query term, bound safely by execute().
    sql = '''
        SELECT p.documentName AS docName, SUM(frequency) AS freq, GROUP_CONCAT(indexes) AS idxs
        FROM Posting p
        WHERE
            p.word IN ({seq})
        GROUP BY p.documentName
        ORDER BY freq DESC;'''.format(seq=','.join(['?'] * len(query)))
    cursor = c.execute(sql, query)

    for row in cursor:
        path = '../input/' + row[0]

        with open(path, "r", encoding='utf8', errors='ignore') as f:
            html = f.read()
            words = process_text(html)
            indexes = [int(s) for s in row[2].split(',')]
            snippet = print_snippet(words, indexes)
            print("%d\t\t %s\t\t\t%s" % (row[1], row[0], snippet))

    conn.close()
Example #6
import os


def indexer():
    directory = '../input'

    print('Indexing...')
    for file in os.listdir(directory):
        if file.endswith("html"):
            with open(os.path.join(directory, file),
                      "r",
                      encoding='utf8',
                      errors='ignore') as f:
                html = f.read()
                words = process_text(html)
                unique_words = set()  # set gives O(1) membership tests
                for word in words:
                    word = process_word(word)
                    if word != "":
                        try:
                            write_to_index_word(word)
                        except Exception:
                            # Errors here (likely duplicate inserts) are ignored.
                            pass
                        if word not in unique_words:
                            generate_posting(word, words, file)
                            unique_words.add(word)
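`write_to_index_word` and `generate_posting` are defined elsewhere in the project. A sketch of what `generate_posting` might do, matching the `Posting` schema assumed under Example #5 (this implementation is an assumption):

import sqlite3

def generate_posting(word, words, file):
    # Hypothetical: store the word's frequency and comma-separated
    # positions for this document in the Posting table.
    indexes = [i for i, w in enumerate(words) if process_word(w) == word]
    conn = sqlite3.connect('inverted-index.db')
    conn.execute(
        'INSERT INTO Posting (word, documentName, frequency, indexes) '
        'VALUES (?, ?, ?, ?)',
        (word, file, len(indexes), ','.join(map(str, indexes))))
    conn.commit()
    conn.close()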