def preProcessTweets():
    processedTweets = []
    # Words that carry no signal for this corpus and are filtered out below.
    noise_words = {'http', 'get', 'is', 'ny', 'lol', 'na', 'u', '-', 'us', 'im'}
    with open("E:/twitter10k.csv", newline='', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='|')
        for row in reader:
            try:
                tweet = row[-2]
                preProcessor = preprocessor.Preprocessor()
                processedTweet = preProcessor.preprocess_text(tweet)
                for word in processedTweet:
                    if word not in noise_words:
                        processedTweets.append(word)
            except Exception:
                # Skip malformed rows instead of aborting the whole run.
                pass
    return processedTweets
def __init__(self, image_file_paths, mean_image=None, bandstats_file_path=None,
             original_size_x=650, original_size_y=650, input_size=256,
             slice_count_x=1, slice_count_y=1, is_8_channel=True):
    super().__init__()
    # logger.info("Creating image list dataset from {} images".format(str(len(image_file_paths))))
    self.preprocessor = preprocessor.Preprocessor(
        datapath=None,
        original_size_x=original_size_x,
        original_size_y=original_size_y,
        input_size=input_size,
        slice_count_x=slice_count_x,
        slice_count_y=slice_count_y,
        is_8_channel=is_8_channel)
    # Point the preprocessor at the bandstats file explicitly; it cannot
    # discover the location on its own.
    self.preprocessor.path_mgr.bandstats_file = bandstats_file_path
    self.image_file_paths = image_file_paths
    self.slice_count = slice_count_x * slice_count_y
    self.current_image_path = ""
    self.is_8_channel = is_8_channel
    self.preloaded_slices = {}
    # TODO(martun): later change mean subtraction into a transformation.
    self.mean_image = mean_image
def voodooOneFile(fullName, inputPath, fileList):
    fullOutput = fullOutputName(fullName, inputPath)
    mkdirOf(fullOutput)
    output = ''
    try:
        output += voodoo.voodoo(input=fullName,
                                output=fullOutput,
                                pathToRemoveFromIdentifier=inputPath,
                                voodooDBFile=args.voodooDB,
                                includes=args.includePath,
                                defines=args.define,
                                trace=False,
                                preIncludes=args.preInclude)
        state = "V"
    except Exception as e:
        if str(e).find("all argume") != -1:
            raise
        inputLines = voodoo._readLinesOfFile(fullName)
        prepro = preprocessor.Preprocessor(fullName, fullOutput, inputLines, inputPath)
        output += prepro.intercepter()
        output += "\n/* The error that forced interception:\n" + \
            str(e).replace("*/", "* /") + "\n"
        output += "\n"
        output += "Voodoo stack trace:\n" + traceback.format_exc()
        output += "*/\n"
        output += "\n"
        state = "I"
def main():
    pp = preprocessor.Preprocessor()
    csvs = []
    csvs.extend(pp.do_udc())
    csvs.extend(pp.do_cmdc())
    csvs.extend(pp.do_wiki())
    csvs.extend(pp.do_bdc())
    lexicon.Lexicon(csvs)
def preprocessor(argv):
    mp3_path = argv[2]
    book_path = argv[3]
    print("Running the preprocessor.")
    print("Path to mp3: " + mp3_path)
    print("Path to book: " + book_path)
    # Named `prep` to avoid shadowing this function's own name.
    prep = pr.Preprocessor(mp3_path, book_path, PREPROCESSOR_PATH)
    prep.preprocess()
def set_preprocessor(self, preprocessor_):
    # Normalize the argument to a list of preprocessors.
    if preprocessor_ is None:
        preprocessor_ = [preprocessor.Preprocessor()]
    elif not isinstance(preprocessor_, list):
        preprocessor_ = [preprocessor_]
    self.preprocessors = preprocessor_
    Xtrain, ytrain = self.split(self.training_data, self.target_column)
    for pp in self.preprocessors:
        pp.fit(Xtrain)
        Xtrain = self._preprocess_one(Xtrain, pp)
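# Usage sketch for set_preprocessor above (hypothetical `trainer` object; assumes
# the surrounding class was constructed with training_data and target_column).
# Because the argument is normalized to a list, these two calls are equivalent:
#
#     trainer.set_preprocessor(preprocessor.Preprocessor())
#     trainer.set_preprocessor([preprocessor.Preprocessor()])
#
# Passing None also works and falls back to a single default Preprocessor.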
def __init__(self, nominal_src, nominal_file, conditions_ls):
    self.src = nominal_src
    self.input_name = nominal_file
    self.nominal_model = preprocessor.Preprocessor(nominal_src + nominal_file)
    self.nominal_model.clean_input()
    self.nominal_model.generate_species_classes()
    self.Temp_ls = [conditions_ls[0]]    # should be a list
    self.Pres_ls = [conditions_ls[1]]    # should be a list
    self.Energy_grid = conditions_ls[2]  # should be a float
    self.new_ne_file = []
def setUp(self):
    # Initial runtime environment.
    args = {"config_file": "../config_omniphotos.yaml"}
    self.preprocessor = preprocessor.Preprocessor(args)
    self.preprocessor.root_dir = \
        pathlib.Path("D:/workdata/testDatasets/circular/KyotoShrines_test")
    self.preprocessor.image_output_path = \
        pathlib.Path("D:/workdata/testDatasets/circular/KyotoShrines_test/Input")
    self.preprocessor.FPS = 50
    self.preprocessor.omniphotos_config_template_path = \
        "D:/workspace/Python/preprocessing/template/config.yaml.template"
def main():
    # The original --project_dir/--report_dir flags were required but then
    # immediately overwritten by hard-coded paths, so only the hard-coded
    # benchmark list below is kept.
    parser = argparse.ArgumentParser()
    parser.add_argument("--num_files_to_print", type=int, required=False, default=20)
    args = parser.parse_args()
    num_files_to_print = args.num_files_to_print

    # The three benchmark projects and their bug repositories.
    datasets = [
        ("../data/ZXing", "../data/ZXing/ZXingBugRepository.xml"),
        ("../data/Rhino", "../data/Rhino/RhinoBugRepository.xml"),
        ("../data/JodaTime/", "../data/JodaTime/JodaTimeBugRepository.xml"),
    ]
    for project_dir, report_dir in datasets:
        project_report_info = preprocessor.Preprocessor(project_dir, report_dir)
        similarity_info = similarity_calculator.SimilarityCalculator(
            project_report_info.project_frequency_dict,
            project_report_info.report_frequency_dict,
            project_report_info.xml_report,
            num_files_to_print)
def preprocess_input(document, lower=True, remove_punctuation=False,
                     remove_stop_words=False):
    preprocessor = pp.Preprocessor()
    if lower:
        document = document.lower()
    if remove_punctuation:
        document = preprocessor.remove_punctuation(document)
    if remove_stop_words:
        document = preprocessor.remove_stop_words(document, german=True, english=True)
    return document
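# Minimal usage sketch for preprocess_input (hypothetical input string; relies
# only on the function defined above and its pp.Preprocessor dependency):
if __name__ == "__main__":
    cleaned = preprocess_input(
        "Ein BEISPIEL, mit Satzzeichen!",  # mixed-case German sample text
        lower=True,
        remove_punctuation=True,
        remove_stop_words=True,
    )
    print(cleaned)  # lowercased, punctuation and German/English stop words removed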
def make_dataframe(self):
    test_df = pd.read_csv(self.test_path, names=['x', 'y'])
    test_data = preprocessor.Preprocessor(test_df, self.model.vocab)
    test_data.tokenize('x')
    test_data.add_tags('x')
    test_data.lemmatize('x')
    test_data.update_dataframe('x', 'y')
    (test_data.data['pos_score'],
     test_data.data['neg_score'],
     test_data.data['likelihood_pos']) = self.model.predict(test_data.data)
    return test_data.data
def __init__(self, input_path, nominal_file, perturb_dict, nominal_dict,
             abstraction=False):
    self.input_path = input_path
    self.input_name = nominal_file
    self.nominal_model = preprocessor.Preprocessor(input_path + nominal_file)
    self.nominal_model.clean_input()
    self.nominal_model.generate_species_classes(abstraction=abstraction)
    self.perturb_dict = perturb_dict
    self.nominal_dict = nominal_dict
    self.abstraction = abstraction
def main():
    prep = preprocessor.Preprocessor()
    # BUG:
    raw_img_data = prep.read_img_jpeg_bytes("./pedestrain.jpg")
    warm_up(prep)
    server_addr = "/tmp/coin_dl_server"
    client_addr = "/tmp/coin_dl_client"
    # Remove stale Unix domain sockets left over from previous runs.
    for addr in [client_addr, server_addr]:
        try:
            os.remove(addr)
        except OSError:
            pass
    main_loop(server_addr, client_addr, prep, raw_img_data)
def warm_up(self, det, mode):
    self.logger.info("Warming up the detector")
    start = time.time()
    raw_img_data = det.read_img_jpeg_bytes("./pedestrain.jpg")
    if mode == "raw":
        # Warm up the session; the first inference is slow.
        ret = det.inference(raw_img_data)
        ret = det.get_detection_results(*ret)
    elif mode == "preprocessed":
        prep = preprocessor.Preprocessor()
        compressed_img_data = prep.inference(raw_img_data, 70)
        ret = det.inference(compressed_img_data)
        ret = det.get_detection_results(*ret)
    duration = time.time() - start
    self.logger.info(
        f"Warm-up for mode {mode} finished! Took {duration} seconds")
def generate_files():
    check_nltk_resources()
    check_paths()
    p = preprocessor.Preprocessor()
    df = p.load_dataset()
    p.preprocess_synopses(df)
    p.preprocess_genres(df)
    p.build_indexes()
    if settings.USE_W2V:
        p.generate_embedding_weights()
    p.filter_dataset()
    p.encode_genres()
    p.encode_synopses()
    p.save_data()
def get_predictions(g, n):
    possible_genres = list(g.mlb.classes_)
    print("Possible film genres: ", ','.join(possible_genres))
    input_line = 'r'  # input("Insert a comma separated set of genres (r for random, q for quit): ")
    if input_line == 'q':
        exit()
    randomly = input_line == 'r'
    p = preprocessor.Preprocessor()
    if randomly:
        n_genres = random.randint(1, 6)
        input_genres = random.sample(possible_genres, n_genres)
    else:
        input_genres = input_line.split(',')
        for ig in input_genres:
            if ig not in possible_genres:
                print(ig + " is not a possible genre")
                get_predictions(g, n)
                return
    print("Input genres: ", ', '.join(input_genres))
    encoded_genres = g.mlb.transform([input_genres])
    mode = input("Input g or b for greedy or beam search mode: ")
    previous_words = input("Introduce help/previous words (optional): ")
    previous_words = p.clean_text(previous_words)
    previous_words = p.tokenize(previous_words)[:-1]
    # Map out-of-vocabulary words to the unknown token.
    prvs = []
    for pw in previous_words:
        if pw in g.word_to_index.keys():
            prvs.append(pw)
        else:
            prvs.append(settings.UNKNOWN_TOKEN)
    previous_words = prvs if prvs else None
    print("Starting words: " + str(previous_words))
    if mode == 'g':
        print("Greedy search mode")
        syn = get_predictions_greedy(g, n, encoded_genres, previous_words)
    elif mode == 'b':
        print("Beam search mode")
        syn = get_predictions_beam(g=g, n=n, encoded_genres=encoded_genres,
                                   previous_words=previous_words)
    else:
        print("Wrong mode")
        get_predictions(g, n)
        return
    print("Synopsis: ", syn)
    get_predictions(g, n)
class FramePipeline:

    def __init__(self, frameWidth, frameHeight):
        self.frameWidth = frameWidth
        self.frameHeight = frameHeight
        self.cam = cm.Camera()
        self.preprocessor = prep.Preprocessor()
        self.homographyOp = homo.Homography()
        self.currOriginalFrame = None
        self.cam.init(9, 6, 'camera_cal/calibration*.jpg')
        self.cam.calibrate()
        self.homographyOp.setFrameSize(frameWidth, frameHeight)
        self.homographyOp.estimateRoadHomography()
        self.laneLinesFinder = lf.LaneLinesFinder(frameWidth, frameHeight)
        self.visualizer = visu.Visualizer(self.laneLinesFinder, self)

    def processFrame(self, InputImg):
        self.currOriginalFrame = InputImg
        undistortedImg = self.cam.undistortImg(InputImg)
        sobelImg = self.preprocessor.extractEdges(undistortedImg, 'all')
        croppedImg = self.preprocessor.crop(sobelImg)
        rectImg = self.homographyOp.warp(croppedImg)
        warped_out = self.laneLinesFinder.findLane(rectImg)
        output = self.visualizer.visualizeFrame(rectImg)
        # Only for the report at the end:
        # cv2.imwrite('afterUndist.jpg', undistortedImg)
        # cv2.imwrite('afterSobel.jpg', sobelImg)
        # cv2.imwrite('afterCropping.jpg', croppedImg)
        # cv2.imwrite('afterRectifying.jpg', rectImg)
        # cv2.imwrite('afterFitting.jpg', warped_out)
        # cv2.imwrite('afterWarpingBack.jpg', output)
        # cv2.waitKey()
        return output
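# Usage sketch for FramePipeline (hypothetical frame source; assumes OpenCV and
# the calibration images under camera_cal/ are available):
#
#     import cv2
#     pipeline = FramePipeline(frameWidth=1280, frameHeight=720)
#     frame = cv2.imread('test1.jpg')           # hypothetical input frame
#     annotated = pipeline.processFrame(frame)  # undistort -> edges -> crop -> warp -> fit
#     cv2.imwrite('annotated.jpg', annotated)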
def __init__(self, methodName):
    super().__init__(methodName)
    self.preprocessor = preprocessor.Preprocessor()
    column_names = ['label', 'text']
    data_to_process = {
        'label': ['ham', 'ham', 'spam'],
        'text': [
            'Not normalized #$text',
            ' AnOthEr not normalized TEXT',
            'Not normalized SPAM'
        ],
    }
    self.dataset_to_process = pd.DataFrame(data_to_process, columns=column_names)
    correct_data = {
        'label': [0, 0, 1],
        'text': ['normalized text', 'another normalized text', 'normalized spam'],
    }
    self.correct_dataset = pd.DataFrame(correct_data, columns=column_names)
def test2():
    '''
    ===========================================================
    Steps:
        1. Keep numbers and punctuation
        2. No tokenizer
        3. Keep stop words
        4. Add POS tags
    ===========================================================
    '''
    print('running preprocessor test 2 ...')
    pattern = re.compile(r'[^а-яА-Я0-9,.!?;\- ё]')
    analyzer = maru.get_analyzer(tagger='linear')
    config = preprocessor.Config(regexp=pattern,
                                 stopwords=stopwords_set,
                                 analyzer=analyzer,
                                 with_pos_tag=True,
                                 remove_stop_words=False,
                                 lemmatize=True,
                                 tokenizer=None)
    pipeline = preprocessor.Preprocessor(config)
    case = [
        'Так говорила в июле 1805 года известная',
        '— Как можно быть здоровой... когда нравственно страдаешь?',
        'праздник отменен, Je vous avoue que toutes ces fêtes'
    ]
    expected = [[
        'так_ADV', 'говорить_VERB', 'в_ADP', 'июль_NOUN', '1805_NUM',
        'год_NOUN', 'известный_ADJ'
    ], [
        'как_CONJ', 'можно_ADJ', 'быть_VERB', 'здоровой..._CONJ',
        'когда_CONJ', 'нравственно_ADV', 'страдаешь?_PRON'
    ], ['праздник_NOUN', 'отменен,_VERB']]
    res = pipeline.fit(case).transform(case)
    for res_line, expected_line in zip(res, expected):
        assert compare(res_line, expected_line), \
            'failed with {} and {}'.format(res_line, expected_line)
    print('test 2 passed')
def test4():
    '''
    ===========================================================
    Steps:
        1. Keep numbers and punctuation
        2. Razdel tokenizer
        3. Keep stop words
        4. No POS tagging
        5. No lemmatization
    ===========================================================
    '''
    print('running preprocessor test 4 ...')
    pattern = re.compile(r'[^а-яА-Я0-9,.!?;\- ё]')
    analyzer = maru.get_analyzer(tagger='linear')
    tokenizer = razdel
    config = preprocessor.Config(regexp=pattern,
                                 stopwords=stopwords_set,
                                 analyzer=analyzer,
                                 with_pos_tag=False,
                                 remove_stop_words=False,
                                 lemmatize=False,
                                 tokenizer=tokenizer)
    pipeline = preprocessor.Preprocessor(config)
    case = [
        'Так говорила в июле 1805 года известная',
        '— Как можно быть здоровой... когда нравственно страдаешь?',
        'праздник отменен, Je vous avoue que toutes ces fêtes'
    ]
    expected = [[
        'так', 'говорила', 'в', 'июле', '1805', 'года', 'известная'
    ], [
        'как', 'можно', 'быть', 'здоровой', '...', 'когда',
        'нравственно', 'страдаешь', '?'
    ], ['праздник', 'отменен', ',']]
    res = pipeline.fit(case).transform(case)
    for res_line, expected_line in zip(res, expected):
        assert compare(res_line, expected_line), \
            'failed with {} and {}'.format(res_line, expected_line)
    print('test 4 passed')
def test():
    raw_img_data = Detector.read_img_jpeg_bytes("./pedestrain.jpg")
    det = Detector(mode="raw")
    ret = det.inference(raw_img_data)
    resp = det.get_detection_results(*ret)
    print("*** Inference result of raw image!")
    print(resp)
    del det
    gc.collect(1)
    gc.collect(2)

    # Test detection of a preprocessed image.
    prep = preprocessor.Preprocessor()
    compressed_img_data = prep.inference(raw_img_data, 70)
    print(
        f"*** Raw image size: {len(raw_img_data)}B, preprocessed image size: {len(compressed_img_data)}B"
    )
    det = Detector(mode="preprocessed")
    ret = det.inference(compressed_img_data)
    resp_prep = det.get_detection_results(*ret)
    print("*** Inference result of preprocessed image!")
    print(resp_prep)
def doc2vec(self, sentences):
    fname = get_tmpfile('doc2vec.model')
    edited_sentences = {}
    train_corpus = []
    count = 0
    for index, sentence in sentences.items():
        processed_sentence = preprocessor.Preprocessor(sentence).preprocessData()
        if not processed_sentence:
            continue
        tokens = gensim.utils.simple_preprocess(processed_sentence)
        # Tags must be a list; a bare string would be iterated character by
        # character, silently creating one tag per digit.
        train_corpus.append(TaggedDocument(tokens, [str(count)]))
        edited_sentences[count] = sentence
        count += 1
    model = Doc2Vec(train_corpus,
                    vector_size=10,
                    dbow_words=1,
                    dm=1,
                    window=2,
                    min_count=2)
    return (model, train_corpus, edited_sentences)
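# Sketch of querying the model returned by doc2vec above (hypothetical caller
# and tokens; infer_vector and dv.most_similar are standard gensim Doc2Vec API):
#
#     model, train_corpus, edited_sentences = obj.doc2vec(sentences)
#     vector = model.infer_vector(['some', 'unseen', 'tokens'])
#     similar = model.dv.most_similar([vector], topn=3)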
def test1():
    '''
    ===========================================================
    Full house:
        1. Leave only alphabet characters
        2. Remove stop words
        3. Lemmatize and add POS tags
    ===========================================================
    '''
    print('running preprocessor test 1 ...')
    pattern = re.compile(r'[^а-яА-Я ё]')
    analyzer = maru.get_analyzer(tagger='linear')
    config = preprocessor.Config(regexp=pattern,
                                 stopwords=stopwords_set,
                                 analyzer=analyzer,
                                 with_pos_tag=True,
                                 remove_stop_words=True,
                                 lemmatize=True,
                                 tokenizer=None)
    pipeline = preprocessor.Preprocessor(config)
    case = [
        'Так говорила в июле 1805 года известная',
        '— Как можно быть здоровой... когда нравственно страдаешь?',
        'праздник отменен, Je vous avoue que toutes ces fêtes'
    ]
    expected = [
        ['говорить_VERB', 'июль_NOUN', 'год_NOUN', 'известный_ADJ'],
        ['здоровый_ADJ', 'нравственно_ADV', 'страдать_VERB'],
        # Here is the case where lemmatization fails:
        ['праздник_NOUN', 'отменный_ADJ']
    ]
    res = pipeline.fit(case).transform(case)
    for res_line, expected_line in zip(res, expected):
        assert compare(res_line, expected_line), \
            'failed with {} and {}'.format(res_line, expected_line)
    print('test 1 passed')
def getNews(self):
    links = self.getGoogleLinks()
    print(len(links))
    news = {}
    for item in range(self.number):
        # Fetch the article text.
        date = int(links[item][0])
        news[date] = {}
        link = links[item][1]
        article = requests.get(link)
        soup = BeautifulSoup(article.text, "html.parser")
        for script in soup(["script", "style", "meta", "noscript"]):
            script.extract()  # rip it out
        text = soup.get_text()
        # Derive the source name from the link.
        source_1 = re.search(r'\.\s*([^.]*)', link).group(1)
        source_2 = re.search(r'//\s*([^.]*)', link).group(1)
        if "/" in source_1:
            source = source_2
        else:
            source = source_1
        news[date]['source'] = source
        news[date]['text'] = preprocessor.Preprocessor(text).preprocessData()
    return news
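# Worked example of the source extraction above, for hypothetical links:
#   link = 'https://www.example.com/story':
#     source_1 captures the text after the first '.'  -> 'example' (no '/', so it is used)
#   link = 'https://example.com/story':
#     source_1 -> 'com/story' (contains '/'), so source_2 after '//' -> 'example'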
try:
    output += voodoo.voodooExpectSource(
        input=fullName,
        output=fullOutput,
        pathToRemoveFromIdentifier=inputPath,
        voodooDBFile=args.voodooDB,
        includes=args.includePath,
        defines=args.define,
        trace=False,
        preIncludes=args.preInclude)
    state = "V"
except Exception as e:
    if str(e).find("all argume") != -1:
        raise
    inputLines = voodoo._readLinesOfFile(fullName)
    prepro = preprocessor.Preprocessor(fullName, fullOutput, inputLines, inputPath)
    output += prepro.intercepter()
    output += "\n/* The error that forced interception:\n" + \
        str(e).replace("*/", "* /") + "\n"
    output += "\n"
    output += "Voodoo stack trace:\n" + traceback.format_exc()
    output += "*/\n"
    output += "\n"
    state = "I"
f = open(fullOutput, "w")
f.write(output)
f.flush()
f.close()
sys.stdout.write(" <%d/%d> %s %s\n" % (1 + fileList.index((fullName, inputPath)),
                                       len(fileList), state, fullOutput))
import csv

import matplotlib.pyplot as plt
import pandas as pd
from keras.layers import Dense
from keras.optimizers import Adam
from keras.models import Sequential
from keras.callbacks import EarlyStopping

import preprocessor
import featureanalysis

# Above this value, the survival flag will be true.
PROBABILITY_MARGIN_SURVIVAL = 0.5

prepr = preprocessor.Preprocessor()
prepr.process_training_dataset('train.csv')
df = pd.read_csv('train.csv')

# Perform feature analysis.
numerical_features = ["Survived", "SibSp", "Parch", "Age", "Fare"]
feat_analysis = featureanalysis.FeatureAnalysis(df)
feat_analysis.get_correlation_numerical_values(numerical_features)

# Cabin and name columns were removed.
input_value, output = prepr.get_train_datasets()

# Get the number of columns in the training data.
n_cols = input_value.shape[1]
import sys

import preprocessor

CMD_OVERWRITE_OPTION = '-ow'

if __name__ == "__main__":
    # Check arguments.
    if len(sys.argv) >= 3:
        # At least two arguments have been passed.
        inp = sys.argv[1]
        out = sys.argv[2]
        # Define the overwrite option.
        overwrite = len(sys.argv) >= 4 and sys.argv[3] == CMD_OVERWRITE_OPTION
        p = preprocessor.Preprocessor(preprocessor.Language.vietnamese)
        try:
            p.preprocess_files(inp, out, {'overwrite': overwrite})
        except (FileNotFoundError, FileExistsError) as errors:
            for e in errors.args:
                if e:
                    print(e)
    else:
        print('Missing arguments. Arguments: input output [-ow]')
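# Invocation sketch for the script above (hypothetical script and file names;
# the trailing -ow flag enables overwriting an existing output file):
#
#     python run_preprocessor.py corpus/raw.txt corpus/clean.txt -ow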
import time

from extractor import Extractor
import preprocessor
import dictionary
import vectorizer
import dataset_divider
import classifier

start = time.time()

PreProcessor = preprocessor.Preprocessor()
Dictionary = dictionary.Dictionary()

categories = [1, 2, 3]  # Categories to be included
lines = []
for category in categories:
    lines.append(
        Extractor.extract(('flashback' + str(category) + '.json'),
                          ('extracted' + str(category) + '.txt')))
    dataset_divider.Divider.divide(('extracted' + str(category) + '.txt'),
                                   lines[len(lines) - 1])

# Pre-processing of training data.
processed = []
processed_test = []
for category in categories:
    processed.append(
        PreProcessor.preprocess('training' + str(category) + ".txt"))
    processed_test.append(
        PreProcessor.preprocess('testing' + str(category) + ".txt"))

with open("testingposts.txt", "w") as file:
import argparse

import preprocessor
from definitions import TEST_PROCESSED_PATH, TRAIN_PROCESSED_PATH


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('-t', '--train', action='store_true')
    return parser.parse_args()


if __name__ == '__main__':
    args = parse_args()
    preprocess = preprocessor.Preprocessor(train=args.train, dl=False)
    preprocess_data = preprocess.clean_data()
    path = TRAIN_PROCESSED_PATH if args.train else TEST_PROCESSED_PATH
    preprocess_data.to_csv(path, encoding='utf-8', index=False)
def main(): """Main method for controlling the flow of the stylometric analyser. Function for creating of objects for word, character, punctuation, word length etc analysis.= to determine the patterns of styles in different works. """ #Column names colnames = ['work', 'char_freq', 'punc_freq', 'stop_freq', 'word_len_freq'] #Initializing an empty dataframe to store all stats after analysis all_text_stats = pd.DataFrame(columns=colnames) #Try block try: #-----------------------------Analysis---------------------------------- #Main loop for doing the analysis file by file for work in works: #calling read_input function to read the content of each file content = read_input(work) #Creating object for preprocessor class pre_processor = prpscr.Preprocessor() pre_processor.tokenise(content) #Fetching the tokens tokens = pre_processor.get_tokenised_list() #Creating object for CharacterAnalyser class char_analyser = char.CharacterAnalyser() #Analysing at character level char_analyser.analyse_characters(tokens) #Fetching the character occurences ch_occ = char_analyser.char_occ #Fetching the punctuation occurences punc_occ = char_analyser.get_punctuation_frequency() #Creating object for WordAnalyser class word_analyser = word.WordAnalyser() #Analysing at word level word_analyser.analyse_words(tokens) #Fetching the stop word occurences stop_occ = word_analyser.get_stopword_frequency() #Fetching the word length occurences word_len_occ = word_analyser.get_word_length_frequency() #Temporary df to store all the analysis for one text at a time temp_df = pd.DataFrame( [[work, ch_occ, punc_occ, stop_occ, word_len_occ]], columns=colnames) all_text_stats = all_text_stats.append(temp_df, ignore_index=True) #-----------------------------Visualisation----------------------------- #Creating object for Visualiser class visualiser = vis.AnalysisVisualiser(all_text_stats) #Visualising punctuation frequencies in all the works visualiser.visualise_punctuation_frequency() #Visualising character frequencies in all the works visualiser.visualise_character_frequency() #Visualising stopword frequencies in all the works visualiser.visualise_stopword_frequency() #Visualising word length frequencies in all the works visualiser.visualise_word_length_frequency() #Catch for exceptions except ImportError as err: print( 'IMPORT ERROR :', err, '. Please check the working directory, name or ' + 'make sure that module is imported!') except TypeError as err: print('TYPE ERROR :', err) except IndexError as err: print('INDEX ERROR :', err) except ValueError as err: print('VALUE ERROR :', err) except IOError as err: print('INPUT ERROR :', err, '. Please check the path of the file!') except requests.RequestException as err: print('REQUEST ERROR :', err) except: print('UNEXPECTED ERROR!')