Пример #1
0
def shingles_file(f):
    """
    Build the word list and shingles for the file object ``f``.

    Extraction only happens when ``f.processed == 1``. In that case the text
    at ``f.path_txt`` is read into ``f.text``, every word of every candidate
    keyword phrase is appended to ``f.words``, and ``f.shingles`` is set from
    ``shingles.gen_shingles(f.words)``. When no shingles come back,
    ``f.processed`` is set to -1 and the function returns before the timing
    message is printed.

    The start message and (unless the early return is taken) the timing
    message are printed regardless of ``f.processed``.

    Relies on the module-level names ``time``, ``stopwords``, ``lemmatizer``,
    ``generateCandidateKeywords`` and ``shingles``. Always returns ``None``.
    """
    print("Started getting shingles for file \"{0}\"".format(f.path_txt))
    start_time = time.time()
    if f.processed == 1:
        # The context manager closes the handle even if read() raises.
        with open(f.path_txt, "r") as f_tmp:
            f.text = f_tmp.read()
        candidate_keywords = generateCandidateKeywords(f.text, stopwords, lemmatizer=lemmatizer.get())
        # Each entry is a phrase, i.e. an iterable of words; flatten them all
        # into the file's word list.
        for phrases in candidate_keywords:
            f.words.extend(phrases)
        f.shingles = shingles.gen_shingles(f.words)
        if len(f.shingles) == 0:
            print("No shingles were built in \"{0}\"".format(f.path_txt))
            f.processed = -1
            return
    end_time = time.time()
    print("Getting shingles for file \"{0}\"\ntook {1:.3f}".format(f.path_txt, end_time - start_time))
Пример #2
0
    def run(self):
        """
        Download the candidate file and score it against the reference file.

        Steps, in order: download ``self._file.url`` into ``DOWNLOAD_DIR``,
        compare hashes with ``self._compare_file``, parse the download into
        ``self._file.path_txt``, build shingles from its words, and compare
        them with the reference file's shingles.

        The outcome is stored in ``self._return``:

        * ``self._file`` once a similarity has been set on it, either the
          string ``"HASH_COINSIDENCE"`` on a hash match or the value returned
          by ``shingles.compare``;
        * ``None`` when the download returns an empty path, when the file is
          flagged ``appropriate == False`` after parsing, or when no shingles
          could be built.

        If the download stage raises, a failure message is printed and
        ``exit(0)`` is called; ``self._return`` is not assigned on that path.
        """

        def report(*lines):
            # Print one message block (lines separated by "\n", terminated by
            # a blank line) while holding threadLock, so the block is written
            # as a unit with respect to other holders of the lock. ``with``
            # releases the lock even if print() raises; with manual
            # acquire()/release() a failing print would leave the lock held
            # and any later acquire could block.
            with threadLock:
                print(*lines, sep="\n", end="\n\n")

        # --- Download -------------------------------------------------------
        try:
            report("{0}) Downloading \"{1}\"".format(self._id, self._file.url))
            self._file.path = download.download(self._file.url, directory=DOWNLOAD_DIR, callback=None)
            if self._file.path == "":
                # An empty path is how the downloader reports that nothing
                # was saved.
                report(
                    "Error, during downloading (server is not corresponding)",
                    "\"{0}\"".format(self._file.url),
                )
                self._file.appropriate = False
                self._return = None
                return
        except Exception:
            # Narrowed from a bare ``except:`` so SystemExit and
            # KeyboardInterrupt are no longer intercepted here.
            report("Download \"{0}\" failed".format(self._file.url))
            # NOTE(review): exit() raises SystemExit with status 0 although
            # the download failed, and self._return is left untouched on this
            # path. Kept as in the original; confirm this is intended.
            exit(0)
        else:
            report(
                "File \"{0}\"".format(self._file.url),
                "Saved in \'{0}\'".format(self._file.path),
            )

        # --- Hash comparison ------------------------------------------------
        # Equal hashes short-circuit the run: no parsing or shingling needed.
        self._file.makehash()
        if self._file.hash == self._compare_file.hash:
            self._file.similarity = "HASH_COINSIDENCE"
            self._return = self._file
            # Purely informational output; it can be removed.
            report(
                '////////////////////////////////////////////',
                "Similarity with file",
                "\'{0}\'".format(self._file.url),
                "is",
                "HASH_COINSIDENCE",
                '////////////////////////////////////////////',
            )
            return

        # --- Parse the PDF into a TXT file ----------------------------------
        self._file.path_txt = self._file.path + ".txt"
        report("Started parsing \"{0}\"".format(self._file.path))
        start_time = time()
        parse_pdf(self._file)  # return value was never used
        end_time = time()
        report("Parsing \"{0}\" took {1:.3f}".format(self._file.path, end_time - start_time))
        # ``== False`` is kept on purpose: a falsy-but-not-False value such as
        # None still lets the run continue, exactly as before.
        if self._file.appropriate == False:
            self._return = None
            return

        # --- Build shingles -------------------------------------------------
        report("Started getting shingles for file \"{0}\"".format(self._file.path_txt))
        start_time = time()
        self._file.text = get_text(self._file)
        self._file.words = get_words(self._file, self._stopwords)
        self._file.shingles = shingles.gen_shingles(self._file.words)
        end_time = time()
        if len(self._file.shingles) == 0:
            report("No shingles were built in file \"{0}\"".format(self._file.path_txt))
            self._return = None
            return

        report(
            "Shingles were built successfully for an file \"{0}\"".format(self._file.path_txt),
            "Took about {0:.3f}s".format(end_time - start_time),
        )

        # --- Compare files using shingles -----------------------------------
        report("Started getting similarity based on shingles in file \"{0}\"".format(self._file.path_txt))
        start_time = time()
        self._file.similarity = shingles.compare(self._compare_file.shingles, self._file.shingles)
        end_time = time()
        report(
            "Ended comparing shingles for file \"{0}\"".format(self._file.path_txt),
            "Took {0:.3f}s".format(end_time - start_time),
        )

        report(
            '////////////////////////////////////////////',
            "Similarity with file",
            "\'{0}\'".format(self._file.url),
            "is",
            "{0:.3f}".format(self._file.similarity),
            '////////////////////////////////////////////',
        )

        self._return = self._file
Пример #3
0
# Build the word list and shingles for every file whose ``processed`` flag is 1.
print("\nGetting shingles for txt files")
for f in files:
    if f.processed != 1:
        continue
    # The context manager closes the handle even if read() raises.
    with open(f.path_txt, "r") as f_tmp:
        text = f_tmp.read()
    candidate_keywords = generateCandidateKeywords(text, stopwords, lemmatizer)
    # Flatten the candidate phrases (each an iterable of words) into one list.
    words = [word for sublist in candidate_keywords for word in sublist]
    f.words = words
    f.shingles = shingles.gen_shingles(words)
    if len(f.shingles) == 0:
        print("\nNo shingles were built in file \"{0}\"\nPossible, a file is empty - PDF parse error\n".format(f.path_txt))
        # Flag the file with -1 (presumably "failed"; confirm against the
        # code that reads ``processed``).
        f.processed = -1


# Work with the input file.
print("Started work with main file")
input_file_path = "./Task/Выделение набора ключевых слов/0470749822.pdf"
# NOTE(review): ``file`` is not a Python 3 builtin, so this presumably refers
# to a project-defined class; confirm where it is defined.
input_file = file(path=input_file_path)
try:
    retcode = subprocess.call(["python", "./pdf_import.py", f.path_pdf, f.path_txt])
    if (retcode != 0):
        print("Error while parsing {0}".format(f.path_pdf))
Пример #4
0
    EXTRA_TIME -= time()
    print("Getting keyword phrases took {0:.3f}".format(end_time - start_time), "seconds", end = "\n\n")

    if (output_file.path_txt != None):
        out = open(output_file.path_txt, "w")
        for key in keywords:
            out.write("{0}\n".format(key[0]))
        out.close()

    """
    Started getting shingles for input file
    """
    print("Started getting shingles for input file \"{0}\"".format(input_file.path_txt))
    start_time = time()
    input_file.words = file_process.get_words(input_file, stopwords)
    input_file.shingles = shingles.gen_shingles(input_file.words)
    end_time = time()
    if len(input_file.shingles) == 0:
        print("No shingles were built in input file")
        exit(0)
    print("Shingles were built successfully for an input file \"{0}\"".format(input_file.path_txt))
    print("Took about {0:.3f}s".format(end_time - start_time), end = "\n\n")

    """
    Started building query for Google
    """
    query = "("
    for wordphrase in input_file.keywords:
        query += wordphrase[0] + ")+("
    query = query[:-2]
    print("Getting links from Google search")