def create_data_set(three_classes, i):
    """Build the per-sentence feature dataset for one input document.

    The document is obtained from ``read_input(i)``, split into sentences,
    and every sentence is re-parsed on its own with ``nlp`` before the
    feature extractors are applied to it.

    Args:
        three_classes: Accepted for interface compatibility; this
            implementation never reads it.
        i: Identifier forwarded unchanged to ``read_input``.

    Returns:
        The result of ``create_dataframe`` applied to the list of
        per-sentence feature dicts (one dict per sentence, in order).
    """
    doc = read_input(i)

    # ``Span.text`` replaces the deprecated ``Span.string`` (removed in
    # spaCy 3). After ``strip()`` both yield the same sentence text, so this
    # works on old and new spaCy versions alike.
    sentences = [sent.text.strip() for sent in doc.sents]

    print(sentences)

    rows = []
    for s in sentences:
        # Each sentence is parsed separately so the extractors see a
        # sentence-level document rather than a slice of the full one.
        sen = nlp(s)
        rows.append({
            "text": s,
            "pos": pos.get_pos_tags(sen),
            "dep": dependency.get_dep_tags(sen),
            "ner": ner.get_ner(sen),
            # The next four were annotated as "bool" in the original code.
            "closing_question_mark":
                sentence_closes_with_question_mark.sentence_closes_with_question_marks(sen),
            "contains_adverb": contains_adverb.contains_adverb(sen),
            "contains_modal": contains_modal_verb.contains_modal_verb(sen),
            "first_person": reference_to_first_person.contains_first_person(sen),
            "argumentative_discourse_markers":
                argumentative_discourse_markers.contains_argumentative_markers(sen),
        })

    return create_dataframe(rows)
def create_data_set(i):
    """Build a feature DataFrame for the sentences of corpus file ``i``.

    Reads ``../Corpus/<i>.txt``, splits it into sentences with ``nlp``,
    computes the hand-crafted features for every sentence and appends the
    columns produced by the module-level vectorizer ``cv``.

    Args:
        i: Name (without extension) of the text file inside ``../Corpus``.

    Returns:
        A ``pandas.DataFrame`` with one row per sentence: the hand-crafted
        feature columns (the ``text`` column is dropped) followed by one
        column per feature name reported by ``cv``.
    """
    # Context manager so the file handle is released even if reading fails;
    # the original left the handle open.
    with open("../Corpus/%s.txt" % i, "r") as f_in:
        text = f_in.read()

    doc = nlp(text)

    # ``Span.text`` replaces the deprecated ``Span.string`` (removed in
    # spaCy 3); after ``strip()`` both yield the same sentence text.
    sentences = [sent.text.strip() for sent in doc.sents]

    rows = []
    for s in sentences:
        print(s)
        # Each sentence is parsed on its own before feature extraction.
        sen = nlp(s)
        rows.append({
            "text": s,
            # The next four were annotated as "bool" in the original code.
            "closing_question_mark":
                sentence_closes_with_question_mark.sentence_closes_with_question_marks(sen),
            "contains_adverb": contains_adverb.contains_adverb(sen),
            "contains_modal": contains_modal_verb.contains_modal_verb(sen),
            "first_person": reference_to_first_person.contains_first_person(sen),
            "argumentative_discourse_markers":
                argumentative_discourse_markers.contains_argumentative_markers(sen),
            "contains_named_entities": ner.contains_ner(sen),
            "sentence_length": sentence_length.get_sentence_length(sen),
        })

    df = pd.DataFrame.from_dict(rows)

    # Vectorize the raw sentence text, then drop it so only feature columns
    # remain in the hand-crafted part of the frame.
    X1 = cv.transform(df.text)
    df = df.drop(columns='text')
    # NOTE(review): if ``cv`` is a scikit-learn vectorizer, newer releases
    # expose ``get_feature_names_out`` instead of ``get_feature_names`` -
    # confirm against the installed version.
    count_vect_df = pd.DataFrame(X1.todense(), columns=cv.get_feature_names())

    # Both frames carry a default 0..n-1 index, so a column-wise concat
    # lines the rows up sentence by sentence.
    combined_df = pd.concat([df, count_vect_df], axis=1)

    return combined_df
Пример #3
0
def create_data_set(text, cv):
    """Build a feature DataFrame for the sentences contained in ``text``.

    Args:
        text: Raw text; it is parsed with ``nlp`` and split into sentences.
        cv: Vectorizer object providing ``transform`` and
            ``get_feature_names``; its output columns are appended to the
            hand-crafted features.

    Returns:
        A ``pandas.DataFrame`` with one row per sentence: the hand-crafted
        feature columns (the ``text`` column is dropped) followed by one
        column per feature name reported by ``cv``.
    """
    doc = nlp(text)

    # ``Span.text`` replaces the deprecated ``Span.string`` (removed in
    # spaCy 3); after ``strip()`` both yield the same sentence text.
    sentences = [sent.text.strip() for sent in doc.sents]

    rows = []
    for s in sentences:
        # Each sentence is parsed on its own before feature extraction.
        sen = nlp(s)
        rows.append({
            "text": s,
            # The next four were annotated as "bool" in the original code.
            "closing_question_mark":
                sentence_closes_with_question_mark.sentence_closes_with_question_marks(sen),
            "contains_adverb": contains_adverb.contains_adverb(sen),
            "contains_modal": contains_modal_verb.contains_modal_verb(sen),
            "first_person": reference_to_first_person.contains_first_person(sen),
            "argumentative_discourse_markers":
                argumentative_discourse_markers.contains_argumentative_markers(sen),
            "contains_named_entities": ner.contains_ner(sen),
            "sentence_length": sentence_length.get_sentence_length(sen),
        })

    df = pd.DataFrame.from_dict(rows)

    # Vectorize the raw sentence text, then drop it so only feature columns
    # remain in the hand-crafted part of the frame.
    X1 = cv.transform(df.text)
    df = df.drop(columns='text')
    # NOTE(review): if ``cv`` is a scikit-learn vectorizer, newer releases
    # expose ``get_feature_names_out`` instead of ``get_feature_names`` -
    # confirm against the installed version.
    count_vect_df = pd.DataFrame(X1.todense(), columns=cv.get_feature_names())

    # Both frames carry a default 0..n-1 index, so a column-wise concat
    # lines the rows up sentence by sentence.
    combined_df = pd.concat([df, count_vect_df], axis=1)

    return combined_df
Пример #4
0
def create_data_set(three_classes):
    """Build the labelled feature dataset from the annotated corpus.

    For every document index 0..989 the function reads
    ``../Corpus/<i>.txt`` and the matching ``../Corpus/<i>.ann``, collects
    the annotations, splits the text into sentences and computes one feature
    dict per sentence. The label of each sentence comes from
    ``get_label(sentence, annotations)``.

    Args:
        three_classes: If falsy, the labels ``'Claim'`` and ``'Premise'``
            are merged into the single label ``'Argumentative'``; otherwise
            the labels found in the annotation files are kept as they are.

    Returns:
        The result of ``create_dataframe`` applied to the feature dicts of
        all sentences of all documents.
    """
    f = []
    # The corpus size is hard-coded: documents are named 0.txt .. 989.txt.
    for i in range(0, 990):
        # Context managers close both files on every iteration; the original
        # left two handles open per document.
        with open("../Corpus/%s.txt" % i, "r") as f_in:
            text = f_in.read()

        annotations = []
        with open("../Corpus/%s.ann" % i, "r") as f_ann:
            for line in f_ann:
                fields = line.split("\t")
                # Only lines whose first tab-separated field starts with "T"
                # are used; everything else is skipped.
                if not fields[0].startswith("T"):
                    continue
                annotation = fields[2]
                # The label is the first space-separated token of field two.
                label = fields[1].split(" ")[0]

                if not three_classes and label in ('Claim', 'Premise'):
                    label = 'Argumentative'
                annotations.append((annotation.strip(), label))

        doc = nlp(text)

        # ``Span.text`` replaces the deprecated ``Span.string`` (removed in
        # spaCy 3); after ``strip()`` both yield the same sentence text.
        sentences = [sent.text.strip() for sent in doc.sents]

        for s in sentences:
            # Each sentence is parsed on its own before feature extraction.
            sen = nlp(s)
            f.append({
                "label": get_label(s, annotations),
                "text": s,
                # The next four were annotated as "bool" in the original code.
                "closing_question_mark":
                    sentence_closes_with_question_mark.sentence_closes_with_question_marks(sen),
                "contains_adverb": contains_adverb.contains_adverb(sen),
                "contains_modal": contains_modal_verb.contains_modal_verb(sen),
                "first_person": reference_to_first_person.contains_first_person(sen),
                "argumentative_discourse_markers":
                    argumentative_discourse_markers.contains_argumentative_markers(sen),
                "contains_named_entities": ner.contains_ner(sen),
                "sentence_length": sentence_length.get_sentence_length(sen),
            })

    dataset = create_dataframe(f)

    return dataset