Example #1
def preprocess_savefig(root, progressbar, paths, params, type):

    if not os.path.isdir(os.path.join(root, "preprocessed")):
        for x in ["individual", "joined"]:
            for y in ["healthy", "defective"]:
                os.makedirs(os.path.join(root, "preprocessed", x, y))
    
    progressbar["value"] = 0
    progressbar["maximum"] = len(paths)

    _, img_sample = preprocess(img_path=paths[0][0], params=params)
    shape = img_sample.shape
    imgs = []
    for p in range(len(paths)):
        img_join = np.zeros((shape[0]*2, shape[1], shape[2]), dtype='uint8')
        for n in range(2):
            _, img_np = preprocess(img_path=paths[p][n], params=params)
            img_join[n*shape[0]: (n+1)*shape[0]] = img_np
            img_pil = Image.fromarray(img_np)
            img_pil.save(os.path.join(root, "preprocessed", "individual", type, f"bean{p+1}_side{n+1}.png"))
        img_pil = Image.fromarray(img_join)
        img_pil.save(os.path.join(root, "preprocessed", "joined", type, f"bean{p+1}.png"))

        progressbar["value"] += 1
        progressbar.update()
    progressbar["value"] = 0
    progressbar.update()
Example #2
def main():
    train_file ="/Users/phx/downloads/competetion/recipe/train.json"
    with open(train_file) as file:
        data = json.load(file)
    print("size of dataset %d" % len(data))

    data = preprocess(data)

    train_data = [data[i] for i in xrange(0,len(data)) if i%3 !=0]
    test_data = [data[i] for i in xrange(0,len(data)) if i%3 ==0]

    #test_data= preprocess(test_data)

    attribute_map = getAttributeMap(train_data,1)

    print('attribute number : %d' % len(attribute_map))
    print(attribute_map)


    label_map = getLabelMap(data)
    print('label number : %d' %len(label_map))
    print(label_map)
    X,y = getDataSet(train_data,attribute_map,label_map)
    testX,testY= getDataSet(test_data,attribute_map,label_map)
    sgd = SGDClassifier(loss='log')
    generate_save_proba(sgd,X,y,testX,testY,"SGDClassifier.loss_log")
    mnb = MultinomialNB(alpha=0.08, class_prior=None, fit_prior=True)
    generate_save_proba(mnb,X,y,testX,testY,"MultinomialNB.alpha_0.08")
    rf = RandomForestClassifier(n_estimators=500)
    generate_save_proba(rf,X,y,testX,testY,"RandomForestClassifier.n_estimators_500")
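The helper generate_save_proba is not included in this excerpt. Below is a minimal sketch of what such a helper could look like, assuming scikit-learn estimators and NumPy arrays; the reported metric, output file name and saved format are assumptions, not taken from the original project:

import numpy as np

def generate_save_proba(clf, X, y, testX, testY, name):
    # Fit on the training split and report hold-out accuracy.
    clf.fit(X, y)
    print("%s accuracy: %.4f" % (name, clf.score(testX, testY)))
    # Persist class-probability estimates, e.g. for later ensembling.
    np.save(name + ".proba.npy", clf.predict_proba(testX))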
    """
示例#3
0
def run_extractor():
    """Run the full extraction pipeline"""

    subprocess.call('mkdir data/reviews', shell=True)
    subprocess.call('mkdir data/tagged', shell=True)
    subprocess.call('mkdir data/untagged', shell=True)
    subprocess.call('mkdir data/to_parse', shell=True)
    subprocess.call('mkdir data/parsed/', shell=True)

    preprocessor.preprocess()

    subprocess.call('javac -cp ./external/stanford-postagger.jar Tagger.java',
                    shell=True)
    subprocess.call(
        'java -cp ".:./external/stanford-postagger.jar" -Xmx1024m Tagger ./external/left3words-wsj-0-18.tagger data/reviews data/tagged data/untagged',
        shell=True)

    multiword_attr_identifier.identify_multiword_attrs()
    parser_preparation.pre_parse()
    parser.parse_parallel(4)
    extraction_generator.generate_extractions()
    common_extraction_generator.generate_common_extractions()
    attribute_classifier.classify()
    extraction_filterer.filter_extractions()
    polarity_computer.compute_polarities()
Example #4
def main():

    choice = input(
        "Do you want to clean the original the files first ? (Y/N) \n")
    if choice == 'y' or choice == 'Y':
        i = 0

        # To get the list of files avoiding the hidden files that may start with '.' or '~'
        original_filelist = [
            f for f in listdir(original_filepath)
            if not (f.startswith('.') or f.startswith('~'))
        ]

        for filename in original_filelist:
            i += 1
            print(filename)
            preprocess(filename)
            print(i, "file cleaning done")

    choice = input("Do you want to process the cleaned files ? (Y/N) \n")

    if choice == 'y' or choice == 'Y':

        for size in windowsizes:
            window_generator(size)

    perform_calc()
Example #5
def ip_16_32_count(file_name):
    # adding ip count, network(IP/16bit) count to 'stat_dict' by preprocess.py
    stat_dict = {'ip':{}, 'network':{}}
    for fname in tqdm(file_name, total=len(file_name)):
            with open(rf"{data_path}{os.sep}{fname}", "rb") as file:
                pk = pickle.load(file)
                preprocess(pk, stat_dict)
    return stat_dict
Example #6
def get_input(image, boxes):
    images = get_cropped_images(boxes, image)
    preprocessed_images = [
        normalize(images),
        preprocess(images, histogram_stretching),
        preprocess(images, histogram_equalization), boxes
    ]
    return preprocessed_images
Example #7
def get_input(image, boxes):
    images = get_cropped_images(boxes, image)
    adeq_images = preprocess(np.array([image]),
                             adaptive_histogram_equalization)[0]
    preprocessed_images = [
        preprocess(images, histogram_equalization),
        preprocess(images, histogram_stretching),
        get_cropped_images(boxes, adeq_images), boxes
    ]
    return preprocessed_images
Example #8
def get_preprocessed_images(images):
    images = [
        normalize(images),
        preprocess(images, histogram_stretching),
        preprocess(images, histogram_equalization)
    ]
    images = [
        np.array([resize(img, (256, 256)) for img in imgs]) for imgs in images
    ]
    return images
Example #9
def process_link(link):
    """
    Processes the given link, does some noise removal
    and return the detailed page in form of a HTMLNode object
    :param str link:
    :return: list of HTMLNode
    """
    website = constants.website

    # Build the right website link
    if link[0] == '/':
        if re.search('(^((http[s]{0,1}://)?www\.)?.+\.[a-z]+)/', website) is None:
            constants.logger.error('Unknown website link format')
            return None
        else:
            site_pref = re.findall('(^((http[s]{0,1}://)?www\.)?.+\.[a-z]+)/', website)[0][0]
            website = site_pref + link
    else:
        website = link

    # Launch website and get HTML code
    try:
        response = urllib.request.urlopen(website)
    except urllib.error.URLError:
        constants.logger.error('Page:"%s" was not able to launch' % website)
        return None
    source = response.read().decode('latin-1')

    # Transfer HTML code via easyhtml
    dom_parser = parser.DOMParser()
    dom_parser.feed(str(source))
    document = dom_parser.get_dom()

    # Finding the html node
    html_object = None
    for node in document.elements:
        if isinstance(node, easyhtml.dom.HTMLTag):
            if node.tag_name == 'html':
                html_object = node
                break
    if html_object is None:
        constants.logger.error('No html tag was found on detailed page')
        return None

    # Transforming the dom tree into the built in data objects
    # of HTMLNodes
    detailed_page = HTMLNode(html_object, 0)

    # preprocessing and noise removal
    preprocessor.preprocess(detailed_page)
    preprocessor.remove_noise_dp(detailed_page)

    # Finding and returning main text
    return detailed_page
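A usage sketch for the function above; the link is a hypothetical placeholder. A relative link is resolved against constants.website, and None is returned if the page cannot be fetched or parsed:

detail_node = process_link('/articles/example-item')  # placeholder relative link
if detail_node is not None:
    print('fetched and preprocessed detail page')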
Example #10
def get_inputs(images, boxes):
    cropped_images = np.array([imgs for i in range(0, len(images)) for imgs in get_cropped_images(boxes[i], images[i])])
    flattened_boxes = np.array([values for _boxes in boxes for values in _boxes])

    preprocessed_images = [
        normalize(cropped_images),
        preprocess(cropped_images, histogram_stretching),
        preprocess(cropped_images, histogram_equalization),
        flattened_boxes
    ]
    return preprocessed_images
Example #11
def main():
    macros = preprocessor.load_pokecrystal_macros()
    macro_table = preprocessor.make_macro_table(macros)

    stdout = sys.stdout

    for source in sys.argv[1:]:
        dest = os.path.splitext(source)[0] + '.tx'
        sys.stdin = open(source, 'r')
        sys.stdout = open(dest, 'w')
        preprocessor.preprocess(macro_table)

    # reset stdout
    sys.stdout = stdout
Example #12
def main():
    macros = preprocessor.load_pokecrystal_macros()
    macro_table = preprocessor.make_macro_table(macros)

    stdout = sys.stdout

    for source in sys.argv[1:]:
        dest = os.path.splitext(source)[0] + '.tx'
        sys.stdin  = open(source, 'r')
        sys.stdout = open(dest, 'w')
        preprocessor.preprocess(macro_table)

    # reset stdout
    sys.stdout = stdout
Example #13
def get_num_episodes():
    while True:
        try:
            with open(info_filename, 'r') as f:
                lines = f.readlines()
                #last line is blank.
                num_eps = len(lines) - 1
                assert num_eps > 0
                break
        except:
            print('preprocessing...')
            preprocessor.preprocess()
    print('preprocessing completed')
    return num_eps
Example #14
def main():
    config = configuration.Config()
    macros = preprocessor.load_pokecrystal_macros()

    stdout = sys.stdout

    for source in sys.argv[1:]:
        dest = os.path.splitext(source)[0] + '.tx'
        sys.stdin  = open(source, 'r')
        sys.stdout = open(dest, 'w')
        preprocessor.preprocess(config, macros)

    # reset stdout
    sys.stdout = stdout
Example #15
def improved_indexer(documents):
    index = {}
    m = len(documents)
    doc_lengths = {}

    stopword_file = open(os.path.join(os.path.dirname(__file__), 'stopword_list.txt'), 'r')
    stopword_list = []

    for line in stopword_file:
        stopword_list.append(line.rstrip())

    termlist = {}
    for recordnum in documents:
        document = documents[recordnum]['text']
        doc_lengths[recordnum] = []

        for (i, field) in enumerate(document):
            priority = i
            tokens = preprocess(field, stopword_list)
            for token in tokens:
                if (token, priority) in index:
                    if recordnum in index[(token, priority)]:
                        index[(token, priority)][recordnum] += 1
                    else:
                        index[(token, priority)][recordnum] = 1
                else:
                    index[(token, priority)] = {recordnum: 1}

                if token in termlist:
                    if recordnum not in termlist[token]:
                        termlist[token].append(recordnum)
                else:
                    termlist[token] = [recordnum]

            doc_lengths[recordnum].append(len(tokens))

    all_doc_lengths = [doc_lengths[recordnum] for recordnum in doc_lengths]
    doc_lengths_avg = numpy.average(numpy.matrix(all_doc_lengths), axis=0).tolist()[0]

    doc_lengths['avg'] = doc_lengths_avg

    enhanced_index = {}
    d = defaultdict(list)

    for word, priority in index:
        d[word].append(priority)
    terms = dict((k, v) for (k, v) in d.items())

    for word in terms:

        docs_with_word = termlist[word]
        idf = log10((m+1.0) / len(docs_with_word))

        for priority in terms[word]:
            enhanced_index[(word, priority)] = {}

            for document in index[(word, priority)]:
                enhanced_index[(word, priority)][document] = [index[(word, priority)][document], idf]

    return [enhanced_index, doc_lengths]
Example #16
def process():

    # Read the form data from the HTTP request
    text = request.form.get("text", "")

    # Run preprocessing
    text = preprocess(text)

    # Run tagging
    text = tag(text, "http://localhost:7000")

    # Run chunking
    text = chunk(text)

    # Run normalization
    text = normalize(text)

    # Build a JSON HTTP response containing the processed text
    return jsonify({
        "status": "success",
        "message": "Request successful",
        "data": {
            "text": text
        }
    })
Example #17
def get_topic_sentiment_nltk(topic_keywords):
    topic_max_distance = [0]
    topic_min_distance = [0]
    topic_sarcastic = False

    for topic_keyword in topic_keywords:
        tweets = get_tweets_for_feature_extraction(topic_keyword, 3)

        tweets_positive = [0]
        tweets_negative = [0]
        tweets_sarcastic = False
        for tweet in tweets:
            processed_tweet = preprocess(tweet["text"])
            processed_text = processed_tweet["text"]

            tokens = nltk.word_tokenize(processed_text)
            tokens = [(t.lower()) for t in tokens]

            mean_sentiment = sentiment_helper.score_sentence(tokens)
            positive_sentence_sentiment = mean_sentiment[0]
            negative_sentence_sentiment = mean_sentiment[1]

            tweets_positive.append(positive_sentence_sentiment)
            tweets_negative.append(negative_sentence_sentiment)
            tweets_sarcastic = ("#sarcasm" in processed_tweet["hashtags"]) or tweets_sarcastic

        topic_max_distance.append(max(tweets_positive) - min(tweets_positive))
        topic_min_distance.append(max(tweets_negative) - min(tweets_negative))
        topic_sarcastic = topic_sarcastic or tweets_sarcastic

    return sum(topic_max_distance) / (len(topic_keywords) or 1), sum(topic_min_distance) / (len(topic_keywords) or 1), int(topic_sarcastic)
Example #18
def process_line(line):
    if line.strip() == '':  # Don't process empty lines any further
        if cCountEmptyLines:
            return "\\State", None, False, 0
        else:
            return "\\Statex", None, False, 0

    sp = line.split("#")
    comment = ""
    if len(sp) > 1:
        if len(sp[-2]) == 0 or not sp[-2][-1] == "\\":
            comment = sp[-1]
            line = "\\#".join(sp[:-1])
        else:
            if not len(sp[-2]) == 0:
                sp[-2] = sp[-2][:-1]
            line = "\\#".join(sp)

    comment = comment.strip()
    line = line.strip()
    line = preprocess(line)

    terminator = None
    process_lvl = False
    transform = 0
    if line == "":
        line = generate_comment_line(comment)
    else:
        keyword = get_keyword(line)
        generator = get_generator(keyword)
        line, terminator, process_lvl, transform = generator(line)
        if not comment == "":
            line += " \\Comment{\ " + comment + "}"

    return line, terminator, process_lvl, transform  # Add generated line to result
Example #19
def telemetry(sid, data):
    if data:
        # The current steering angle of the car
        steering_angle = data["steering_angle"]
        # The current throttle of the car
        throttle = data["throttle"]
        # The current speed of the car
        speed = data["speed"]
        # The current image from the center camera of the car
        imgString = data["image"]
        image = Image.open(BytesIO(base64.b64decode(imgString)))
        image_array = np.asarray(image)
        image_array = preprocessor.preprocess(image_array)
        steering_angle = float(
            model.predict(image_array[None, :, :, :], batch_size=1))

        throttle = controller.update(float(speed))

        print(steering_angle, throttle)
        send_control(steering_angle, throttle)

        # save frame
        if args.image_folder != '':
            timestamp = datetime.utcnow().strftime('%Y_%m_%d_%H_%M_%S_%f')[:-3]
            image_filename = os.path.join(args.image_folder, timestamp)
            image.save('{}.jpg'.format(image_filename))
    else:
        # NOTE: DON'T EDIT THIS.
        sio.emit('manual', data={}, skip_sid=True)
Example #20
def run_configs(data_dir, reviews_filename):
    # directory where the preprocessed files will be stored
    preprocessed_dir = data_dir + "preprocessed_files/"

    # directory of raw data eg. {root}/data/electronics/reviews_Electronics_5
    filename = data_dir + reviews_filename

    # file endings
    raw = filename + ".json.gz"
    reviews = filename + "_reviews.txt"
    ratings = filename + "_ratings.npy"

    # possible preprocessing steps
    preprocess_steps = {
        "reg_lemma":
        ["clean", "regexp_tokenize", "remove_stop_words", "lemmatize"],
        # "reg_stem": ["clean", "regexp_tokenize", "remove_stop_words", "stem"],
        # "tw_lemma": ["clean", "tweet_tokenize", "remove_stop_words", "lemmatize"],
        # "tw_stem": ["clean", "tweet_tokenize", "remove_stop_words", "stem"],
    }

    for step in preprocess_steps:
        # generate a new filename eg. {root}/data/preprocessed_files/electronics/reviews_Electronics_5_tw_stem.txt
        preprocessed_filename = preprocessed_dir + filename.replace(
            data_dir, "") + "_" + step + ".txt"
        # if given file does not exist, preprocess input file with given steps and save it
        if not os.path.isfile(preprocessed_filename):
            preprocessed_texts = preprocessor.preprocess(
                reviews, preprocess_steps[step])
            preprocessor.save_texts(preprocessed_texts, preprocessed_filename)
Example #21
def recommend(inputs):
    recommendation_list = []
    all_recommendation = analyzer.recommend_start(inputs)
    features = preprocessor.preprocess(inputs)
    recommendations = lookup_table.lookup(features)
    recommendations.append(all_recommendation)
    return all_recommendation
Example #22
    def on_start_file_chooser_button_clicked(self, widget):
        window = self.shell_ui.get_object("all_window")
        dialog = Gtk.FileChooserDialog(
            title="Please choose a file",
            parent=window,
            action=Gtk.FileChooserAction.OPEN,
            buttons=(Gtk.STOCK_CANCEL, Gtk.ResponseType.CANCEL, Gtk.STOCK_OPEN,
                     Gtk.ResponseType.OK))
        response = dialog.run()
        if response == Gtk.ResponseType.OK:
            selected_file_path = dialog.get_filename()
            relative_path = os.path.basename(selected_file_path)
            inputfile = open(relative_path, "r")
            code = inputfile.read()
            lines = code.split('\n')
            finalfile = lines[0].split('.')[0] + '.8085'
            print(lines[0].split('.')[0])
            print(finalfile)

            entries_box = self.shell_ui.get_object("start_entries_box")
            wids = entries_box.get_children()
            for widget in wids:
                widget.destroy()
            i = 0
            print(lines)
            for line in lines:
                if line != '':
                    self.z.append(line)
                    label = Gtk.Label("Code" + str(i))
                    tv = Gtk.TextView()
                    tb = tv.get_buffer()
                    entries_box.add(label)
                    entries_box.add(tv)
                    i += 1
                    with open(line, "r") as file:
                        s = file.read()
                        tb.set_text(s)
                        print(s)
            self.shell_ui.get_object("start_entry_number_entry").set_text(
                str(i))
            entries_box.show_all()
            self.x = preprocess(self.z)
            processed_box = self.shell_ui.get_object("processed_box")
            i = 0
            for file_name in self.x:
                if file_name != '':
                    label = Gtk.Label("Code" + str(i))
                    tv = Gtk.TextView()
                    tb = tv.get_buffer()
                    processed_box.add(label)
                    processed_box.add(tv)
                    i += 1
                    with open(file_name, "r") as file:
                        s = file.read()
                        tb.set_text(s)
                        print(s)
            processed_box.show_all()
        elif response == Gtk.ResponseType.CANCEL:
            print("Cancel clicked")
        dialog.destroy()
Example #23
def translated_data():
    directory = 'C:\\Users\\olive\\Desktop\\Datasets_for_thesis\\Prisjakt\\training_data'
    extracted_data = extractor.json_extract(directory)
    extracted_reviews = extracted_data[0]
    polarities = extracted_data[1]
    preprocessed_reviews = preprocessor.preprocess(extracted_reviews)
    dictionary = dict.Dictionary(preprocessed_reviews).dictionary
    # review_translator.translate_reviews(preprocessed_reviews, polarities)
    with open('untranslated_reviews validation combined.txt', 'r') as file:
        untranslated_reviews = np.concatenate(
            vectorizer.vectorize_data(
                preprocessor.preprocess(file.readlines()), dictionary, 300))
    with open('translated_polarities validation combined.txt', 'r') as file:
        translated_polarities = []
        for line in file:
            translated_polarities.append(int(line))
    return [untranslated_reviews, np.array(translated_polarities)]
Example #24
def km(num):
    
    num = int(num)
    data = preprocess('data.txt', None , [] )
    X_principal = xnormalize(data)
    km_name = kmeans_cluster(X_principal , num)

    return {"figure": "cluster/" + km_name}
Example #25
    def test_success(self):
        list_docs = [
            'Hôm nay, tôi đi học. 12321 ', 'Hôm nay, trời 432 đẹp quá!'
        ]
        list_docs = preprocessor.preprocess(list_docs)

        transformer = Text2Vector()
        transformer.fit(list_docs)

        print('Most common words: ', transformer.get_most_common(10))

        vec = transformer.doc_to_vec(
            preprocessor.preprocess(
                ['Hôm nay, tôi 332 đi học.', 'Hôm nay, 43 tôi đi chơi.!']))
        print('Vec: ', vec)
        text = transformer.vec_to_doc(vec)
        print('Text: ', text)
Example #26
 def __init__(self, content):
     self.content = content
     self.sents = preprocess(content)
     self.word2count = self.countword()
     self.k1 = 1.50
     self.b = 0.75
     self.stopWords = stopwords.words('english')
     self.title = None
Example #27
 def test_preprocess_with_prefix_and_suffix(self):
     parts = {'qwe', 'wer'}
     prefix, suffix = 'prefix', 'suffix'
     s = ''.join((prefix, '{', '|'.join(parts), '}', suffix))
     result = set(preprocess(s))
     expected = set(''.join((prefix, '{', value, '}', suffix))
                    for value in parts)
     self.assertEqual(expected, result)
Example #28
def main(range_map_geodatabase_path, layer_name,
         forest_dependency_spreadsheet_path, global_canopy_cover_thresh,
         aoo_canopy_cover_thresh, altitude_limits_table_path,
         generation_lengths_table_path):
    """This function is the core of the application. It performs the pre-processing,
    analysis and post-processing.

    :param range_map_geodatabase_path: Path to an ESRI file geodatabase containing range
        maps to be analysed. See README for required format.
    :param layer_name: Name of the layer in the geodatabase at geodatabase_path
        containing the range maps to be analysed.
    :param forest_dependency_spreadsheet_path: Path to a spreadsheet containing species'
        forest dependency information. See README for required format.
    :param global_canopy_cover_thresh: Pixels in the "treecover2000" layer with an
        intensity less than this threshold are excluded from all computations: they
        are not counted as tree cover.
    :param aoo_canopy_cover_thresh: 2km by 2km grid cells containing a proportion of
        tree cover greater than aoo_canopy_cover_thresh are counted as forested cells
        for the purpose of AOO estimation.
    :param altitude_limits_table_path: Path to a CSV file containing species' minimum
        and maximum altitudes. See README for required format.
    :param generation_lengths_table_path: Path to a CSV file containing species'
        generation lengths. See README for required format.
    :return:
    """
    # Google Cloud Platform authentication.
    os.system('gcloud auth login')
    # Google Earth Engine authentication.
    ee.Authenticate()

    ee.Initialize()

    range_map_ic_gee_path = preprocess(range_map_geodatabase_path, layer_name,
                                       forest_dependency_spreadsheet_path)

    print_w_timestamp('Waiting for all GEE tasks to complete...')
    wait_until_all_tasks_complete()
    print_w_timestamp('Done.')

    if global_canopy_cover_thresh:
        if aoo_canopy_cover_thresh:
            analyse(altitude_limits_table_path, range_map_ic_gee_path,
                    global_canopy_cover_thresh, aoo_canopy_cover_thresh)
        else:
            analyse(altitude_limits_table_path, range_map_ic_gee_path,
                    global_canopy_cover_thresh)
    else:
        if aoo_canopy_cover_thresh:
            analyse(altitude_limits_table_path, range_map_ic_gee_path,
                    aoo_canopy_cover_thresh)
        else:
            analyse(altitude_limits_table_path, range_map_ic_gee_path)

    print_w_timestamp('Waiting for all GEE tasks to complete...')
    wait_until_all_tasks_complete()
    print_w_timestamp('Done.')

    postprocess(generation_lengths_table_path)
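For illustration, a hypothetical invocation of this entry point; every path and threshold value below is a placeholder rather than a value from the original project:

main(range_map_geodatabase_path='data/range_maps.gdb',
     layer_name='species_ranges',
     forest_dependency_spreadsheet_path='data/forest_dependency.xlsx',
     global_canopy_cover_thresh=30,
     aoo_canopy_cover_thresh=0.5,
     altitude_limits_table_path='data/altitude_limits.csv',
     generation_lengths_table_path='data/generation_lengths.csv')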
Example #29
    def input(self, s):
        self.lexer.lineno = ExtendedLineNo(1, 0)
        self.errors = []
        self.lexer.errors = self.errors
        self.token_gen = self.generator(self.lexer.token)

        ps = preprocess(s)

        return self.lexer.input(ps)
Example #30
def genSent():
    objPre = preprocess()
    objPre = objPre.load()
    sentences = obj.generateSent(objPre.word_to_index, 1000,
                                 objPre.index_to_word)
    print sentences[:5]
    print "writing " + str(len(sentences)) + " news"
    write_line = '\n'.join(sentences)
    open(FILE_NAME + '_sentences', 'w').write(write_line.encode('utf-8'))
Example #31
def process_message(msg, channel=None):
    slice = json.loads(msg)
    text = preprocess(slice['text'])
    reply = None
    yap_reply = get_yap(text)
    slice['tokens'] = get_tokens(yap_reply)
    slice['raw_yap'] = yap_reply
    submit_yapped(slice, channel)
    all_processed.append(slice)
Example #32
    def on_start_file_chooser_button_clicked(self, widget):
        window = self.shell_ui.get_object("all_window")
        dialog = Gtk.FileChooserDialog(title="Please choose a file", parent=window, action=Gtk.FileChooserAction.OPEN,
                                       buttons=(
                                           Gtk.STOCK_CANCEL, Gtk.ResponseType.CANCEL, Gtk.STOCK_OPEN,
                                           Gtk.ResponseType.OK))
        response = dialog.run()
        if response == Gtk.ResponseType.OK:
            selected_file_path = dialog.get_filename()
            relative_path = os.path.basename(selected_file_path)
            inputfile = open(relative_path, "r")
            code = inputfile.read()
            lines = code.split('\n')
            finalfile = lines[0].split('.')[0] + '.8085'
            print(lines[0].split('.')[0])
            print(finalfile)

            entries_box = self.shell_ui.get_object("start_entries_box")
            wids = entries_box.get_children()
            for widget in wids:
                widget.destroy()
            i = 0
            print (lines)
            for line in lines:
                if line != '':
                    self.z.append(line)
                    label = Gtk.Label("Code" + str(i))
                    tv = Gtk.TextView()
                    tb = tv.get_buffer()
                    entries_box.add(label)
                    entries_box.add(tv)
                    i += 1
                    with open(line, "r") as file:
                        s = file.read()
                        tb.set_text(s)
                        print(s)
            self.shell_ui.get_object("start_entry_number_entry").set_text(str(i))
            entries_box.show_all()
            self.x = preprocess(self.z)
            processed_box = self.shell_ui.get_object("processed_box")
            i = 0
            for file_name in self.x:
                if file_name != '':
                    label = Gtk.Label("Code" + str(i))
                    tv = Gtk.TextView()
                    tb = tv.get_buffer()
                    processed_box.add(label)
                    processed_box.add(tv)
                    i += 1
                    with open(file_name, "r") as file:
                        s = file.read()
                        tb.set_text(s)
                        print(s)
            processed_box.show_all()
        elif response == Gtk.ResponseType.CANCEL:
            print("Cancel clicked")
        dialog.destroy()
Example #33
def calculate_tf_idf_docs():
    #Get the list of documents and their data
    documents = fetch_documents()

    #Preprocess Documents
    preprocessed_documents = []
    for document in documents:
        preprocessed_documents.append(preprocess(document))

    documents = preprocessed_documents

    # Find the list of unique words in the document dataset
    list_of_words = []
    for document in documents:
        for word in document:
            if word not in list_of_words:
                list_of_words.append(word)

    N = len(documents) + 1

    # Generate vector for each document
    copy_documents = documents
    documents_vector = []
    for document in documents:
        doc_vector = []
        for word in list_of_words:
            #Calculate term frequency
            tf = 0
            for term in document:
                if term == word:
                    tf = tf + 1

            #Calculate document frequency
            df = 0
            for copy_document in copy_documents:
                if word in copy_document:
                    df = df + 1

            #Calculate tf-idf
            idf = math.log(N / df)
            tfidf = tf * idf
            doc_vector.append(tfidf)

        documents_vector.append(doc_vector)

    #Generate database
    db = {}
    db['list_of_words'] = list_of_words
    db['N'] = N
    db['documents_vector'] = documents_vector
    db['documents'] = documents

    #Save data to persistence storage
    pickle_out = open(PREPROCESSED_DATA, 'wb')
    pickle.dump(db, pickle_out)
    pickle_out.close()
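As a quick check of the weighting above: with three documents (so N = 4 after the +1), a word found in df = 2 of them gets idf = log(4/2) ≈ 0.693 (natural log, since math.log is used), and a term frequency of 3 in a document then yields a tf-idf weight of roughly 2.08.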
Example #34
def interpret(text):
    text, line_nums, indent_str = preprocess(text)
    program = core_parser.parse(text, trace=False)
    context = Context(op_parser, keywords)
    for stmt in program:
        if isinstance(stmt, Block):
            context.keywords[stmt.keyword](stmt.header, stmt.body, context,
                                           context)
        else:
            print(cata(stmt, lambda ast: parse_ops(ast, context)))
Example #35
def load_image_and_steering_for_train(csv_line):

    correction = 0.2

    steering = float(csv_line[3])
    i = random.randint(0, 2)

    if i == 0:
        img = preprocessor.preprocess(cv2.imread(csv_line[0]))
    elif i == 1:
        img = preprocessor.preprocess(cv2.imread(csv_line[1]))
        steering = steering + correction
    else:
        img = preprocessor.preprocess(cv2.imread(csv_line[2]))
        steering = steering - correction

    img, steering = random_augment(img, steering)

    return img, steering
Example #36
def generate_ground_data(image_path):
    image, img_txt = read_image(image_path)
    copy = image.copy()
    image, segments, euler_list, central_x, central_y = preprocess(image)
    feature_list = get_feature_list(
        image, segments, euler_list, central_x, central_y)[0]
    classes_list = get_class_list(copy, segments)
    with open("%s" % img_txt, 'wb') as test:
        for char, feature in zip(classes_list, feature_list):
            test.write("%s %s\n" % (chr(char), ' '.join(map(str, feature))))
Example #37
def recognize(pic, dir_train_pics):
	pic = Image.open(pic)
	print 'preprocessor.preprocess'
	pic_preprocessed = preprocessor.preprocess(pic)
	block_array = []
	print 'spliter.split'
	spliter.split(pic_preprocessed, block_array)
	captcha = ""
	if len(block_array) == 4:
		print 'recognize_block_array'
		captcha = recognize_block_array(block_array, dir_train_pics)
	return captcha
Example #38
def recognize(pic, dir_train_pics):
	pic = Image.open(pic)
	#print 'preprocessor.preprocess'
	pic_preprocessed = preprocessor.preprocess(pic)
	block_array = []
	#print 'spliter.split'
	spliter.split(pic_preprocessed, block_array)
	captcha = ""
	if len(block_array) >= THRESHOLD_BLOCK_NUMBER:
		#print 'recognize_block_array'
		captcha = recognize_block_array(block_array, dir_train_pics)
	return captcha
Example #39
def run_extractor():
  """Run the full extraction pipeline"""
  
  subprocess.call('mkdir data/reviews', shell=True)
  subprocess.call('mkdir data/tagged', shell=True)
  subprocess.call('mkdir data/untagged', shell=True)
  subprocess.call('mkdir data/to_parse', shell=True)
  subprocess.call('mkdir data/parsed/', shell=True)
  
  preprocessor.preprocess()
  
  subprocess.call('javac -cp ./external/stanford-postagger.jar Tagger.java', shell=True)
  subprocess.call('java -cp ".:./external/stanford-postagger.jar" -Xmx1024m Tagger ./external/left3words-wsj-0-18.tagger data/reviews data/tagged data/untagged', shell=True)
  
  multiword_attr_identifier.identify_multiword_attrs()
  parser_preparation.pre_parse()
  parser.parse_parallel(4)
  extraction_generator.generate_extractions()
  common_extraction_generator.generate_common_extractions()
  attribute_classifier.classify()
  extraction_filterer.filter_extractions()
  polarity_computer.compute_polarities()
Example #40
def assembler_to_hex(source_code, filename=None, preprocessor_only=False):
    """
    Convert a assembler program to `Tiny` machine code.

    Opcodes described at http://redd.it/1kqxz9
    """

    code = preprocess(source_code, filename or '<input>')

    if preprocessor_only:
        return '\n'.join(c.contents for c in code)

    return assemble(code)
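A hypothetical use of this function, assuming source_code already holds the assembler text (the variable and file names are placeholders):

machine_code = assembler_to_hex(source_code, filename='example.asm')
# With preprocessor_only=True, only the preprocessed (macro-expanded) source is returned.
expanded_source = assembler_to_hex(source_code, preprocessor_only=True)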
Example #41
File: view.py Project: bwesterb/DCPU-16
 def assemble(self):
     text = self.editor.GetText()
     try:
         self.reset(False)
         self.program = assembler.parse(preprocessor.preprocess(text))
         self.emu.load(self.program.assemble())
         self.program_list.update(self.program.instructions)
         self.refresh_debug_info()
     except Exception as e:
         self.reset(False)
         dialog = wx.MessageDialog(self, str(e), 'Error',
             wx.ICON_ERROR | wx.OK)
         dialog.ShowModal()
         dialog.Destroy()
Example #42
 def __init__(self, program, mode = "MIPS"):
   super(Assembler, self).__init__()
   try:                   text = program.read()
   except AttributeError: text = program
   self.mode = mode.upper()
   self.registers = Registers(self.mode)
   lines = text.split("\n")
   lines = clean(lines, self.mode)
   instrs, data = split_sections(lines)
   self.memory = Memory()
   for d in data: self.memory.insert(d)
   instrs = preprocess(instrs, self.mode)
   self.labels = label_positions(instrs)
   self.instructions = [Instruction(instr) for instr in instrs]
Example #43
def get_frame(conf, sensor, location, start, end):
    df = pandas.DataFrame()

    # UUID of the data to retrieve
    uuid = conf[sensor][location]["uuid"]
    # IP Address of the Archiver
    server = conf["archiver"]
    # Port of the Archiver
    port = conf[sensor][location]["archiver_port"]

    # Title of the column in the frame
    title = sensor.title() + "_" + location

    # Get frame for each location
    tframe = get_data(uuid, server, port, title, start, end)

    df = preprocess(tframe)
    return df
Example #44
    def extract_train(self, sentences, labels):
        ''' Extract feature vectors and numbered labels from training data.
        @param sentences: list of sentences to be extracted
        @param labels: literal labels of each sentence

        @return X: 2D numpy array, feature vectors, one sentence per row
        @return y: 1D numpy array, numbered label of each sentence
        '''
        literal_labels = list(set(labels))
        print "Labels: ", literal_labels
        y = np.array([literal_labels.index(l) for l in labels])

        sentences = [preprocess(s) for s in sentences]
        self.pre_calculate(sentences)

        Xs = []
        X = np.array([self._extract(s) for s in sentences])
        self.literal_labels = literal_labels
        return X, y
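A hypothetical usage sketch; the owning class name and the example sentences below are placeholders, not taken from the original project:

extractor = FeatureExtractor()  # assumed class that defines extract_train and _extract
X, y = extractor.extract_train(['great battery life', 'screen cracked quickly'],
                               ['positive', 'negative'])
# X: one feature vector per sentence; y: integer-encoded labels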
Example #45
def get_features_from_nltk(tweet):

    # is tweet sarcastic
    is_sarcastic = int("#sarcasm" in tweet["text"])

    processed_tweet = preprocess(tweet["text"])
    processed_text = processed_tweet["text"]

    tokens = nltk.word_tokenize(processed_text)
    tokens = [(t.lower()) for t in tokens]

    mean_sentiment = sentiment_helper.score_sentence(tokens)
    positive_sentence_sentiment = mean_sentiment[0]
    negative_sentence_sentiment = mean_sentiment[1]
    sentence_sentiment = mean_sentiment[0] - mean_sentiment[1]

    word_sentiments = []
    for word in processed_text.split(" "):
        if len(word) > 0:
            word_sentiment = sentiment_helper.score_word(word.lower())
            word_sentiments.append(word_sentiment)

    maximum_word_polarity = max([x[0] for x in word_sentiments])
    minimum_word_polarity = max([x[1] for x in word_sentiments])

    polarity_distance_max = maximum_word_polarity - sentence_sentiment
    polarity_distance_min = abs(minimum_word_polarity - sentence_sentiment)

    blob_text = TextBlob(processed_text)
    topic_keywords = blob_text.noun_phrases + processed_tweet["hashtags"] + processed_tweet["mentions"]
    topic_positive, topic_negative, topic_sarcasm = get_topic_sentiment_nltk(topic_keywords)

    return ["{0:.2f}".format(positive_sentence_sentiment),
            "{0:.2f}".format(negative_sentence_sentiment),
            "{0:.2f}".format(sentence_sentiment),
            "{0:.2f}".format(maximum_word_polarity),
            "{0:.2f}".format(minimum_word_polarity),
            "{0:.2f}".format(polarity_distance_max),
            "{0:.2f}".format(polarity_distance_min),
            "{0:.2f}".format(topic_positive),
            "{0:.2f}".format(topic_negative),
            topic_sarcasm,
            is_sarcastic]
Example #46
File: yasR.py Project: Niols/yasR
def main():
    try:
        print_title ()

        P = Params()
        P.load()
        P.check_all()

        files_to_rename = get_files_to_rename(P.INPUT_DIRS, P.VIDEO_EXTENSIONS)

        actions_to_process = preprocessor.preprocess(
            files      = files_to_rename,
            language   = P.LANGUAGE,
            output_dir = P.OUTPUT_DIR
        )

        processor.process(
            to_process  = actions_to_process,
            config_path = P.get_path(expanded=True),
            ACTION      = P.ACTION
        )

    except KeyboardInterrupt:
        print()
        log.info('Aborting.')
        exit(1)

    except ConnectionError:
        log.fail('Lost connection. Aborting.')
        exit(2)

    except ConnectionRefusedError:
        log.fail('Lost connection. Aborting.')
        exit(2)

    except KeyError as e:
        if e == 'EDITOR':
            log.fail('Could not find the environment variable EDITOR. Aborting.')
            exit(1)
        else:
            log.fail('Uncaught KeyError exception: %s. Aborting.' % e.args[0])
            exit(2)
Example #47
def get_features_from_text_blob(tweet):

    # is tweet sarcastic
    is_sarcastic = int("#sarcasm" in tweet["text"])

    # preprocess tweet content
    processed_tweet = preprocess(tweet["text"])
    processed_text = processed_tweet["text"]

    blob_text = TextBlob(processed_text)

    # measure sentiment features of tweet
    sentence_polarity = blob_text.sentiment.polarity
    sentence_subjectivity = blob_text.sentiment.subjectivity

    # calculate word based polarity to capture extreme expressions
    polarities = []
    for word in processed_text.split(" "):
        blob_word = TextBlob(word)
        polarities.append(blob_word.sentiment.polarity)

    maximum_word_polarity = max(polarities)
    minimum_word_polarity = min(polarities)

    # measure how extreme the most expressive is with respect to whole sentence
    polarity_distance_max = maximum_word_polarity - sentence_polarity
    polarity_distance_min = abs(minimum_word_polarity - sentence_polarity)

    # extract topic based sentiment values; combined polarity, subjectivity and any sarcasm clue
    topic_keywords = blob_text.noun_phrases + processed_tweet["hashtags"] + processed_tweet["mentions"]
    topic_polarity, topic_subjectivity, topic_sarcasm = get_topic_sentiment(topic_keywords)

    return ["{0:.2f}".format(sentence_polarity),
            "{0:.2f}".format(sentence_subjectivity),
            "{0:.2f}".format(maximum_word_polarity),
            "{0:.2f}".format(polarity_distance_max),
            "{0:.2f}".format(polarity_distance_min),
            "{0:.2f}".format(topic_polarity),
            "{0:.2f}".format(topic_subjectivity),
            topic_sarcasm,
            is_sarcastic]
Example #48
        #print key.center(80,'*')+'\n'
        #print item.Serialize()+'\n'

    for key,value in items.items():
        fp.write(key.center(70,'*')+'\n')
        fp.write(value.Serialize()+'\n')

    
if __name__ == '__main__':
    log.InitLog()       
    px=Parser()

    with open('IFC2X3_TC1.exp','rb') as fp:
    #with open('schema.exp','rb') as fp:
        px.parse(fp)

    dataset=px.dataset
    preprocess(dataset)
    with open('IFC2X3_TC1.json','w') as fp:
    #with open('schema.json','w') as fp:
        toJson(dataset.types,fp)
        toJson(dataset.entities,fp)
        toJson(dataset.rules,fp)
        toJson(dataset.functions,fp)

    generater=Generator(dataset)
    generater.generateCommonFiles()
    generater.generateTypes()
    generater.generateEntities()
    generater.generateIndexes()
Example #49
def parse(s, parser=None):
    if parser is None:
        parser = Parser(file_prefix='.d_parser_mach_gen')
    return parser.parse(preprocessor.preprocess(s)).structure
Example #50
		train_filename = sys.argv[i]
	i += 1

print >>sys.stderr, "reading labelled dataset from '" + train_filename + "'..."

input = open(train_filename, "r") if train_filename != "-" else sys.stdin

input.readline()

X = numpy.loadtxt(input, delimiter=",", dtype=numpy.uint8)

labels = X[:,0]
X=X[:,1:].astype(float)

print >>sys.stderr, "training KNN with", min(train_threshold, X.shape[0]), "training instances and k=", k, "..."
clf = make_classifier(preprocess(X[:train_threshold]), labels[:train_threshold], name="KNN", params=[k])

print >>sys.stderr, "making predicitions for", max(0,X.shape[0]-train_threshold), "instances ..."
predictions = clf.predict(preprocess(X[train_threshold:]))

print >>sys.stderr, "evaluating ..."

if verbose:
	for i in range(len(predictions)):
		print labels[train_threshold:][i], predictions[i]
		if labels[train_threshold:][i] != predictions[i]:
			print >>sys.stderr, "should be:", labels[train_threshold:][i], ", was:", predictions[i]
			put_image(X[train_threshold:][i], 0, sys.stderr)
			print >>sys.stderr
else:
	for i in range(len(predictions)):
Example #51
File: test.py Project: codehunks/cloud
from preprocessor import preprocess


print preprocess("Sankararaman case: Kanchi seers, other accused acquitted".split(' '))
Example #52
 def extract(self, sentence):
     '''Extract the feature vector for a testing sentence. The sentence is first turned into a list of words and then the feature extraction logic is delegated to _extract.'''
     return self._extract(preprocess(sentence))
Example #53
	dir_path_step = '../../pics/gujinsuo/pics_step'
	dir_path_train = '../../pics/gujinsuo/pics_train/'

	deal_number = 10

	pic_step1 = 1
	pic_step2 = 2
	pic_step3 = 3

	for pic_ptr in xrange(deal_number):

		pic_ptr_str = str('%04d' % pic_ptr)
		image_path = dir_path_base + pic_ptr_str + '.jpg'

		pic = Image.open(image_path)
		pic_preprocessed = preprocessor.preprocess(pic)

		output_path = dir_path_step + str(pic_step1) + '/' + pic_ptr_str + '_' + str(pic_step1) + '.jpg'
		print output_path
		pic_preprocessed.save(output_path)

		block_array = []
		spliter.split(pic_preprocessed, block_array)
		for i in xrange(len(block_array)):
			output_path = dir_path_step + str(pic_step2) + '/' + pic_ptr_str + '_' + str(pic_step2) + '_' + str(i) + '.jpg'
			print output_path
			block_array[i].save(output_path)

	for pic_ptr in xrange(deal_number):

		pic_ptr_str = str('%04d' % pic_ptr)
Example #54
File: main.py Project: mmfrb/pln-projeto
  else:
    if sys.argv[1] == 'naivebayes' or sys.argv[1] == 'knn':
      annotated_texts = read('blog-gender-dataset.xlsx')

      training_set_len = 0.7 * len(annotated_texts)

      training_set = []
      test_set = []

      for (text,gender) in annotated_texts:
        if 'M' in gender:
          gender = 'M'
        else:
          gender = 'F'
        if len(training_set) < training_set_len:
          training_set.append((preprocess(text), gender))
        else:
          test_set.append((preprocess(text), gender))

      if sys.argv[1] == 'naivebayes':
        classifier = NaiveBayesClassifier(training_set)

      else:
        classifier = KNNClassifier(training_set, 5)

      print(calculate_metrics(test_set, classifier))

    else:
      print('Invalid classifier name. Choose from [naivebayes, knn]')
Example #55
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.ensemble import BaggingClassifier
from sklearn import tree
import numpy
from preprocessor import preprocess, getDataSet, getLabelMap, getAttributeMap
import json

train_file = "/Users/phx/downloads/competetion/recipe/train.json"
with open(train_file) as file:
    data = json.load(file)
print("size of dataset %d" % len(data))

data = preprocess(data)

train_data = [data[i] for i in xrange(0, len(data)) if i % 3 != 0]
test_data = [data[i] for i in xrange(0, len(data)) if i % 3 == 0]

# test_data= preprocess(test_data)

attribute_map = getAttributeMap(train_data, 1)

print("attribute number : %d" % len(attribute_map))
print(attribute_map)


label_map = getLabelMap(data)
print("label number : %d" % len(label_map))
print(label_map)
Example #56
__author__ = 'phx'

import numpy
from sklearn import metrics
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from preprocessor import preprocess,getAttributeMap,getLabelMap,getDataSet
import json
train_file ="train.json"
with open(train_file) as file:
     data = json.load(file)
print(len(data))

data = preprocess(data)
"""
train_data = [data[i] for i in xrange(0,len(data)) if i%4 !=0]
test_data = [data[i] for i in xrange(0,len(data)) if i%4 ==0]
"""
train_data = data


test_file ="/Users/phx/downloads/competetion/recipe/test.json"
with open(test_file) as file:
     test_data = json.load(file)
print(len(test_data))
test_data= preprocess(test_data)


attribute_map = getAttributeMap(train_data,1)
Example #57
            print "filename: ", xml_filename

            text_filepath = outputloc + xml_filename + ".txt"

            pt = ParseText(xml_filepath, text_filepath)
            content = pt.readXmlToString()

            #content_list variable is not used anymore, but is still used in the getBio code that I left so I left this here too
            #content_list = pt.readXMLToList()
            if content in xmlset: continue # Skip duplicates
            xmlset.add(content)

            soup = BeautifulSoup(content, "html.parser")

            #Preprocess using preprocessor.py
            preprocess(soup)

            #Get headings
            headings = pt.findHeadings(PROBABLE_HEADINGS, soup)
            headingsclean = [h.get_text() for h in headings]
            
            #bio = pt.find_bio(content, content_list, headings, heading_indexes)


            #Use find_this function to find edu, exp, leadership, skills, languages, volunteer
            edu, isXml = pt.find_this(soup, ["education", "educaton"], [])
            exp, isXml = pt.find_this(soup, ["experience", "employment", 'career', 'history', 'professional', 'work'], ['objective', 'course'])
            leadExp, x = pt.find_this(soup, ["leadership", 'community', 'extracurricular', 'activities', 'organizations'], [])
            skills, isXml = pt.find_this(soup, ["kills"], [])
            languages, isXml = pt.find_this(soup, ["languages", 'foreign'], ['computer', 'programming'])