Example #1
def get_labeled_docs(gold_csv="gold.csv"):
    path = "data/" + gold_csv
    print('File:', path)
    csv_reader = csv.DictReader(open(path, 'r', newline='', encoding="latin-1"),
                                delimiter=',')
    labeled_docs = []

    for line in csv_reader:
        #print('Line:', line)
        instance_id = line['project_id']
        text = line['tweet_text']
        # TODO:  save the Date field
        label = line['label']

        if label in Config.code_book:
            doc = TextInstance(instance_id, text)
            doc.label = label
            #creating the text line that will be passed through tokenize(docs) in Tokenizer.py
            doc.text = text
            labeled_docs.append(doc)

        else:
            print("post: " + instance_id + " label: " + label)

    # adds tokens and pos tags
    labeled_docs = tokenize(labeled_docs)
    return labeled_docs
Example #2
def clean_text(text):
    # NLP Pre-Processing
    tokenize_words = tokenize(text.lower())
    stop_words_removed = remove_stop_words(tokenize_words)
    relevant_words = [
        WordNetLemmatizer().lemmatize(words, pos='v')
        for words in stop_words_removed if len(words) > 3
    ]
    return relevant_words
Example #3
    def __init__(self, path):
        with open(path) as f:
            text = f.read()
        self.tokens = tokenize(text)
        self.count = 0
        self.ClassScope = SymbolTable()
        self.SubroutineScope = SymbolTable()
        self.label_count = {"if": 0, "while": 0}
        self.vmcode = ""
Example #4
    def check_filter(self, message: str) -> bool:
        """Returns True if message contains a banned word.

        Args:
            message (str): The message to check.
        """
        for word in tokenize(message):
            if word.lower() in self.blacklist:
                return True
        return False
Example #5
def analyze(root_dir):
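    # Tokenize every file found under root_dir, parse each token stream,
    # and return the parsed results together with their file names.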

    if (len(root_dir) == 0):
        print("No input file name")
        exit()

    tokenized_data, file_names = tokenize(root_dir)

    if (len(tokenized_data) != len(file_names)):
        raise Exception("XML data and file names count mismatch")

    parsed_files = []

    for i in range(len(tokenized_data)):
        # print("\n\n\nParsing file #" + str(i + 1) + " (" + file_names[i] + ")")
        parsed_files.append(parse(tokenized_data[i]))
    
    return parsed_files, file_names
Example #6
def find_IOBs(doc):
    IOB_output = []
    # tknzr = TweetTokenizer()
    # tokenize doc.text from docs
    tokens = tokenize(doc)
    #print('THIS IS TOKENS: ', tokens)

    # go over tokens, generating IOB and omitting [ ]
    # status is out, begin or in
    status = 'out'
    for token in tokens:
        #print(token)
        if status == 'out' and token != '[':
            IOB_output.append('O')
            continue
        if status == 'out' and token == '[':
            status = 'begin'
            continue
        if status == 'begin' and token != '/':
            IOB_output.append('B')
            status = 'in'
            continue
        if status == 'begin' and token == '/':
            status = 'cue'
            continue
        if status == 'cue' and token != '~':
            IOB_output.append('C')
            continue
        if status == 'cue' and token == '~':
            status = 'in'
            continue
        if status == 'in' and token != ']' and token != '/':
            IOB_output.append('I')
            continue
        if status == 'in' and token != ']' and token == '/':
            status = 'cue'
            continue
        if status == 'in' and token == ']':
            status = 'out'
    return IOB_output
Example #7
def constructInvertedIndex():
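    # Build the global BTree-backed inverted index over all news documents:
    # record each term occurrence with its position, compute per-document tf and idf,
    # and accumulate the normalization factors for the three scoring schemes.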
    global dictionary
    dictionary = BTree(Node("سسسسسس", 1, []))
    nodesList = []
    docCounter = 0
    for news in getNewsList():
        nodes = {}
        position = 0
        for term in tokenize(normalize(news.content), check_finglish):
            if term != invalidToken:
                nodes[dictionary.addOccurrence(term, news.id, position)] = True
            position += 1
        nodesList.append(nodes)
        for node in nodes:
            node.cal_tf(news.id)
        docCounter += 1
        if docCounter % 20 == 0:
            Laws.heap(getDictionary())
    calAllIdf(dictionary.root)

    i = 0
    for news in getNewsList():  # calculate the documents' normalize factors for 3 scoring schemes
        nodes = nodesList[i]
        sum_of_squares_1 = 0
        sum_of_squares_2 = 0
        sum_of_squares_3 = 0
        for node in nodes.keys():
            sum_of_squares_1 += math.pow((getTf(news.id, node.postingsList) - 1) * node.idf, 2)
            sum_of_squares_2 += math.pow(getTf(news.id, node.postingsList), 2)
            sum_of_squares_3 += math.pow(getTf(news.id, node.postingsList) * node.idf, 2)
        normalizationFactorsScheme1.append(math.sqrt(sum_of_squares_1))
        normalizationFactorsScheme2.append(math.sqrt(sum_of_squares_2))
        normalizationFactorsScheme3.append(math.sqrt(sum_of_squares_3))
        i += 1

    Laws.storeHeapDataSet()
    storeDictionary(dictionary)
    storeNormFactors()
Example #8
    def message_handler(self, m: Message):
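        # Dispatch one incoming chat message by type: channel join ("366"), NOTICE
        # (mod list), PRIVMSG/WHISPER commands and chat learning, and CLEARMSG unlearning.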
        try:
            if m.type == "366":
                logger.info(f"Successfully joined channel: #{m.channel}")
                # Get the list of mods used for modifying the blacklist
                logger.info("Fetching mod list...")
                self.ws.send_message("/mods")

            elif m.type == "NOTICE":
                # Check whether the NOTICE is a response to our /mods request
                if m.message.startswith("The moderators of this channel are:"):
                    string_list = m.message.replace(
                        "The moderators of this channel are:", "").strip()
                    self.mod_list = [m.channel] + string_list.split(", ")
                    logger.info(
                        f"Fetched mod list. Found {len(self.mod_list) - 1} mods."
                    )
                elif m.message == "There are no moderators of this channel.":
                    self.mod_list = [m.channel]
                    logger.info(f"Fetched mod list. Found no mods.")
                # If it is not, log this NOTICE
                else:
                    logger.info(m.message)

            elif m.type in ("PRIVMSG", "WHISPER"):
                if m.message.startswith(
                        "!enable") and self.check_if_permissions(m):
                    if self._enabled:
                        self.ws.send_whisper(
                            m.user, "The generate command is already enabled.")
                    else:
                        self.ws.send_whisper(
                            m.user,
                            "Users can now use generate command again.")
                        self._enabled = True
                        logger.info(
                            "Users can now use generate command again.")

                elif m.message.startswith(
                        "!disable") and self.check_if_permissions(m):
                    if self._enabled:
                        self.ws.send_whisper(
                            m.user,
                            "Users can now no longer use generate command.")
                        self._enabled = False
                        logger.info(
                            "Users can now no longer use generate command.")
                    else:
                        self.ws.send_whisper(
                            m.user,
                            "The generate command is already disabled.")

                elif m.message.startswith(
                    ("!setcooldown",
                     "!setcd")) and self.check_if_permissions(m):
                    split_message = m.message.split(" ")
                    if len(split_message) == 2:
                        try:
                            cooldown = int(split_message[1])
                        except ValueError:
                            self.ws.send_whisper(
                                m.user,
                                f"The parameter must be an integer amount, eg: !setcd 30"
                            )
                            return
                        self.cooldown = cooldown
                        Settings.update_cooldown(cooldown)
                        self.ws.send_whisper(
                            m.user,
                            f"The !generate cooldown has been set to {cooldown} seconds."
                        )
                    else:
                        self.ws.send_whisper(
                            m.user,
                            f"Please add exactly 1 integer parameter, eg: !setcd 30."
                        )

            if m.type == "PRIVMSG":

                # Ignore bot messages
                if m.user.lower() in self.denied_users:
                    return

                if self.check_if_generate(m.message):
                    if not self.enable_generate_command and not self.check_if_permissions(
                            m):
                        return

                    if not self._enabled:
                        if not self.db.check_whisper_ignore(m.user):
                            self.send_whisper(
                                m.user,
                                "The !generate has been turned off. !nopm to stop me from whispering you."
                            )
                        return

                    cur_time = time.time()
                    if self.prev_message_t + self.cooldown < cur_time or self.check_if_permissions(
                            m):
                        if self.check_filter(m.message):
                            sentence = "You can't make me say that, you madman!"
                        else:
                            params = tokenize(
                                m.message
                            )[2:] if self.allow_generate_params else None
                            # Generate an actual sentence
                            sentence, success = self.generate(params)
                            if success:
                                # Reset cooldown if a message was actually generated
                                self.prev_message_t = time.time()
                        logger.info(sentence)
                        self.ws.send_message(sentence)
                    else:
                        if not self.db.check_whisper_ignore(m.user):
                            self.send_whisper(
                                m.user,
                                f"Cooldown hit: {self.prev_message_t + self.cooldown - cur_time:0.2f} out of {self.cooldown:.0f}s remaining. !nopm to stop these cooldown pm's."
                            )
                        logger.info(
                            f"Cooldown hit with {self.prev_message_t + self.cooldown - cur_time:0.2f}s remaining."
                        )
                    return

                # Send help message when requested.
                elif m.message.startswith(
                    ("!ghelp", "!genhelp", "!generatehelp")):
                    self.send_help_message()

                # Ignore the message if it is deemed a command
                elif self.check_if_other_command(m.message):
                    return

                # Ignore the message if it contains a link.
                elif self.check_link(m.message):
                    return

                if "emotes" in m.tags:
                    # If the list of emotes contains "emotesv2_", then the message contains a bit emote,
                    # and we choose not to learn from those messages.
                    if "emotesv2_" in m.tags["emotes"]:
                        return

                    # Replace modified emotes with normal versions,
                    # as the bot will never have the modified emotes unlocked at the time.
                    for modifier in self.extract_modifiers(m.tags["emotes"]):
                        m.message = m.message.replace(modifier, "")

                # Ignore the message if any word in the sentence is on the ban filter
                if self.check_filter(m.message):
                    logger.warning(
                        f"Sentence contained blacklisted word or phrase:\"{m.message}\""
                    )
                    return

                else:
                    # Try to split up sentences. Requires nltk's 'punkt' resource
                    try:
                        sentences = sent_tokenize(m.message.strip())
                    # If 'punkt' is not downloaded, then download it, and retry
                    except LookupError:
                        logger.debug("Downloading required punkt resource...")
                        import nltk
                        nltk.download('punkt')
                        logger.debug("Downloaded required punkt resource.")
                        sentences = sent_tokenize(m.message.strip())

                    for sentence in sentences:
                        # Get all separate words
                        words = sentence.split(" ")
                        # Double spaces will lead to invalid rules. We remove empty words here
                        if "" in words:
                            words = [word for word in words if word]

                        # If the sentence is too short, ignore it and move on to the next.
                        if len(words) <= self.key_length:
                            continue

                        # Add a new starting point for a sentence to the <START>
                        #self.db.add_rule(["<START>"] + [words[x] for x in range(self.key_length)])
                        self.db.add_start_queue(
                            [words[x] for x in range(self.key_length)])

                        # Create Key variable which will be used as a key in the Dictionary for the grammar
                        key = list()
                        for word in words:
                            # Set up key for first use
                            if len(key) < self.key_length:
                                key.append(word)
                                continue

                            self.db.add_rule_queue(key + [word])

                            # Remove the first word, and add the current word,
                            # so that the key is correct for the next word.
                            key.pop(0)
                            key.append(word)
                        # Add <END> at the end of the sentence
                        self.db.add_rule_queue(key + ["<END>"])

            elif m.type == "WHISPER":
                # Allow people to whisper the bot to disable or enable whispers.
                if m.message == "!nopm":
                    logger.debug(f"Adding {m.user} to Do Not Whisper.")
                    self.db.add_whisper_ignore(m.user)
                    self.ws.send_whisper(
                        m.user,
                        "You will no longer be sent whispers. Type !yespm to reenable. "
                    )

                elif m.message == "!yespm":
                    logger.debug(f"Removing {m.user} from Do Not Whisper.")
                    self.db.remove_whisper_ignore(m.user)
                    self.ws.send_whisper(
                        m.user,
                        "You will again be sent whispers. Type !nopm to disable again. "
                    )

                # Note that I add my own username to this list to allow me to manage the
                # blacklist in channels of my bot in channels I am not modded in.
                # I may modify this and add a "allowed users" field in the settings file.
                elif m.user.lower() in self.mod_list + ["cubiedev"
                                                        ] + self.allowed_users:
                    # Adding to the blacklist
                    if self.check_if_our_command(m.message, "!blacklist"):
                        if len(m.message.split()) == 2:
                            # TODO: Remove newly blacklisted word from the Database
                            word = m.message.split()[1].lower()
                            self.blacklist.append(word)
                            logger.info(f"Added `{word}` to Blacklist.")
                            self.write_blacklist(self.blacklist)
                            self.ws.send_whisper(m.user,
                                                 "Added word to Blacklist.")
                        else:
                            self.ws.send_whisper(
                                m.user,
                                "Expected Format: `!blacklist word` to add `word` to the blacklist"
                            )

                    # Removing from the blacklist
                    elif self.check_if_our_command(m.message, "!whitelist"):
                        if len(m.message.split()) == 2:
                            word = m.message.split()[1].lower()
                            try:
                                self.blacklist.remove(word)
                                logger.info(
                                    f"Removed `{word}` from Blacklist.")
                                self.write_blacklist(self.blacklist)
                                self.ws.send_whisper(
                                    m.user, "Removed word from Blacklist.")
                            except ValueError:
                                self.ws.send_whisper(
                                    m.user,
                                    "Word was already not in the blacklist.")
                        else:
                            self.ws.send_whisper(
                                m.user,
                                "Expected Format: `!whitelist word` to remove `word` from the blacklist."
                            )

                    # Checking whether a word is in the blacklist
                    elif self.check_if_our_command(m.message, "!check"):
                        if len(m.message.split()) == 2:
                            word = m.message.split()[1].lower()
                            if word in self.blacklist:
                                self.ws.send_whisper(
                                    m.user, "This word is in the Blacklist.")
                            else:
                                self.ws.send_whisper(
                                    m.user,
                                    "This word is not in the Blacklist.")
                        else:
                            self.ws.send_whisper(
                                m.user,
                                "Expected Format: `!check word` to check whether `word` is on the blacklist."
                            )

            elif m.type == "CLEARMSG":
                # If a message is deleted, its contents will be unlearned,
                # or rather, the "occurances" attribute of each combination of words in the
                # sentence is reduced by 5, and the entry is deleted once it drops below 1.
                self.db.unlearn(m.message)

                # TODO: Think of some efficient way to check whether it was our message that got deleted.
                # If the bot's message was deleted, log this as an error
                #if m.user.lower() == self.nick.lower():
                #    logger.error(f"This bot message was deleted: \"{m.message}\"")

        except Exception as e:
            logger.exception(e)
Example #9
def user_story_processing(user_story):
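    # End-to-end pipeline: pre-process the user story, predict the target database,
    # pick the relevant fields and tables, then run feature selection and suggest algorithms.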

    existing_comparison_technique = ['cosine', 'euclidean', 'manhattan']

    # NLP Pre-Processing
    tokenize_words = tokenize(user_story)
    corrected_words = spell_checker(tokenize_words)
    stop_words_removed = remove_stop_words(corrected_words)
    hypothesis_synonyms_values = synonyms_words(stop_words_removed)

    lda_output = lda_supervised_topic_modelling(stop_words_removed)

    # Insights from Database
    server_connection = database_processing.mysql_connection(
        'root', 'root', 'localhost')
    databases_present = database_processing.database_information(
        server_connection)
    number_of_values = 1

    database_finalisation_list = []

    vectorized_words = word_embedding_tfidf(databases_present,
                                            hypothesis_synonyms_values)

    for comparison_technique in existing_comparison_technique:
        # Finding the Database to be referred
        if comparison_technique == "euclidean":
            extracted_database_finalised = euclidean_distance(
                databases_present, vectorized_words, number_of_values)
            database_finalisation_list.append(extracted_database_finalised)
        elif comparison_technique == "cosine":
            extracted_database_finalised = cosine_similarity(
                databases_present, vectorized_words, number_of_values)
            database_finalisation_list.append(extracted_database_finalised)
        elif comparison_technique == "manhattan":
            extracted_database_finalised = manhattan_distance(
                databases_present, vectorized_words, number_of_values)
            database_finalisation_list.append(extracted_database_finalised)

    database_finalised_value = processing_array_generated(
        database_finalisation_list, number_of_values)
    database_finalised = database_finalised_value[0]

    while (True):
        user_decision = input(
            "Database Predicted by System is " + database_finalised.upper() +
            ".\nIs the prediction Correct?\nYes - If Prediction is Correct\nNo - If Prediction is Wrong\nNA - Not Aware of Database\nq - To go Back : "
        )
        if user_decision == "Yes":
            break
        elif user_decision == "No":
            print("Following are the list of Database Present:")
            count = 1
            for x in range(0, len(databases_present)):
                print(str(count) + " " + databases_present[x].upper())
                count = count + 1
            database_finalised = input(
                "Enter the Correct Database Name: ").lower()
            break
        elif user_decision == "NA":
            print(
                "All Databases present in the Database Connection will be Considered"
            )
            database_finalised = " "
            break
        elif user_decision == "q":
            return
        else:
            print("Kindly insert input in Yes or No")

    database_metadata_information = []
    database_value = []
    table_information = []
    fields = []
    field_datatype = []
    field_comments = []

    if database_finalised == " ":
        for x in range(0, len(databases_present)):
            database_metadata_info, database_val, table_info, field_info, field_datatype_info, field_comments_info = database_processing.database_metadata_information(
                server_connection, databases_present[x])
            database_metadata_information.extend(database_metadata_info)
            database_value.extend(database_val)
            table_information.extend(table_info)
            fields.extend(field_info)
            field_datatype.extend(field_datatype_info)
            field_comments.extend(field_comments_info)

    else:
        database_metadata_information, database_value, table_information, fields, field_datatype, field_comments = database_processing.database_metadata_information(
            server_connection, database_finalised)

    updated_fields_complete = []

    for field in fields:
        field = re.sub('[^0-9a-zA-Z]+', ' ', field)
        updated_fields_complete.append(field)

    updated_fields = pd.unique(updated_fields_complete).tolist()
    field_comments = pd.unique(field_comments).tolist()

    # Advance NLP Processing
    #relevant_words = [words for words in stop_words_removed if len(words) > 3]
    pos_tagged_words = part_of_speech_tagging(stop_words_removed)
    synonyms_values = synonyms_words(pos_tagged_words)

    if (len(updated_fields) <= pos_tagged_words.size):
        number_of_values = len(updated_fields)
    else:
        number_of_values = pos_tagged_words.size

    # Field Value Processing
    relevant_columns_based_on_comments = []
    relevant_columns_based_on_fields = []

    column_predicted_list = []

    if len(updated_fields):
        vectorized_field_words = word_embedding_tfidf(updated_fields,
                                                      synonyms_values)

        for comparison_technique in existing_comparison_technique:
            # Finding the Database to be referred
            if comparison_technique == "euclidean":
                relevant_columns_based_on_fields = euclidean_distance(
                    updated_fields, vectorized_field_words, number_of_values)
            elif comparison_technique == "cosine":
                relevant_columns_based_on_fields = cosine_similarity(
                    updated_fields, vectorized_field_words, number_of_values)
            elif comparison_technique == "manhattan":
                relevant_columns_based_on_fields = manhattan_distance(
                    updated_fields, vectorized_field_words, number_of_values)

            column_predicted_list.extend(relevant_columns_based_on_fields)

    if (len(field_comments) and len(updated_fields) == len(field_comments)):
        vectorized_comment_words = word_embedding_tfidf(
            field_comments, synonyms_values)

        for comparison_technique in existing_comparison_technique:
            # Finding the Database to be referred
            if comparison_technique == "euclidean":
                relevant_columns_based_on_comments = euclidean_distance(
                    field_comments, vectorized_comment_words, number_of_values)
            elif comparison_technique == "cosine":
                relevant_columns_based_on_comments = cosine_similarity(
                    field_comments, vectorized_comment_words, number_of_values)
            elif comparison_technique == "manhattan":
                relevant_columns_based_on_comments = manhattan_distance(
                    field_comments, vectorized_comment_words, number_of_values)

            relevant_fields_based_on_comments = []

            for comments in relevant_columns_based_on_comments:
                relevant_fields_based_on_comments.append(
                    updated_fields[field_comments.index(comments)])

            column_predicted_list.extend(relevant_fields_based_on_comments)

    number_of_values = len(list(set(column_predicted_list)))
    column_finalised = processing_array_generated(column_predicted_list,
                                                  number_of_values)

    field_finalised = []

    for field_value in column_finalised:
        field_finalised.append(
            fields[updated_fields_complete.index(field_value)])

    finalised_database = []
    finalised_table = []

    for field in field_finalised:
        indices = [i for i, x in enumerate(fields) if x == field]
        field_database = []
        field_table = []
        index = 0
        for z in indices:
            field_database.insert(index, database_value[z].upper())
            field_table.insert(index, table_information[z].upper())
            index = index + 1

        field_database = pd.unique(field_database).tolist()
        field_table = pd.unique(field_table).tolist()
        finalised_database.append(field_database)
        finalised_table.append(field_table)

    print('**** After NLP Processing ****')
    result_display(field_finalised, finalised_table, finalised_database)

    print('**** After Feature Selection ****')
    field_finalised, finalised_table, finalised_database, feature_list, logs, feature_encoded = feature_selection_processing(
        field_finalised, finalised_table, finalised_database,
        server_connection)
    print('**** Logs ****')
    for x in range(len(logs)):
        print(logs[x])
    result_display(field_finalised, finalised_table, finalised_database)

    if (lda_output[0] != " ") and (len(field_finalised) != 0):
        print('**** Probable Algorithms ****')
        algorithm_used, accuracy_score, target_feature, independent_features, message = algorithm_selection_processing(
            feature_list, lda_output, feature_encoded)

        if message == " ":
            table = PrettyTable([
                'Preferences', 'Algorithm Preferred', 'Accuracy Percentage',
                'Target Feature (Field Name__Table Name__Database Name)',
                'Independent Features'
            ])
            index = 1
            for i in range(len(algorithm_used)):
                table.add_row([
                    index, algorithm_used[index - 1],
                    accuracy_score[index - 1], target_feature[index - 1],
                    independent_features[index - 1]
                ])
                index = index + 1

            print(table)
        else:
            print(message)
Example #10
    @staticmethod
    def cosine_similarity(text1, text2):
        """
        :param text1, text2: list of tokens
        :return: float
        """
        try:
            tfidf = TfidfVectorizer().fit_transform(
                map(_SimilarityUtil._inverse, [text1, text2]))
            return 1 - cosine(tfidf[0].todense(), tfidf[1].todense())
        except ValueError:
            # Possible containing only stopwords
            return 0


if __name__ == '__main__':
    from Tokenizer import tokenize, tokenize_with_preprocess
    token1 = tokenize('I l\'ove reading')
    token2 = tokenize('I really love look book')
    print(_SimilarityUtil.jaccard_similarity(token1, token2))

    token1_pre = tokenize_with_preprocess('I l\'ove reading')
    token2_pre = tokenize_with_preprocess('I really love look book')
    print(_SimilarityUtil.jaccard_similarity(token1_pre, token2_pre))

    print(_SimilarityUtil.cosine_similarity(token1_pre, token2_pre))
    print(_SimilarityUtil.cosine_similarity(['love', 'apple'],
                                            ['love', 'apple']))
    print(_SimilarityUtil.cosine_similarity(['a'], ['a']))
Example #11
        #self.xml += "<subroutineCall>\n"
        if self.tokens[self.count + 1]["token"] == ".":
            for i in range(3):
                self.writeToken()  # (className|varName) . subroutineName
        else:
            self.writeToken()  # subroutineName
        self.writeToken()  # (
        self.compileExpressionList()
        self.writeToken()  # )
        #self.xml += "</subroutineCall>\n"

    def compileExpressionList(self):
        self.xml += "<expressionList>\n"
        if self.tokens[self.count]["token"] == ")":
            pass
        else:
            self.compileExpression()
            while self.tokens[self.count]["token"] == ",":
                self.writeToken()  # ,
                self.compileExpression()
        self.xml += "</expressionList>\n"


if __name__ == "__main__":
    path = sys.argv[1]
    with open(path) as f:
        text = f.read()
    tokens = tokenize(text)
    parser = ParserXML(tokens)
    parser.writeXML(path[:-5] + ".xml")
Example #12
def parse(line):
    tokens = tokenize(line)
    return parseLine(tokens)
Example #13
def parse(program):
    return read_from_tokens(tokenize(program))