Пример #1
0
def summarize_secitons(document, sections, coef=0.8):
    """Rank each requested section with SectionMMR and save the top sentences.

    For every name in ``sections`` a ``SectionMMR`` ranker is built over all
    sentences of the document and asked to rank that section.  Up to ``num``
    of the ranked sentences are logged through ``logit`` and collected.
    After all sections are processed the collected sentences are written,
    newline-separated and UTF-8 encoded, to
    ``DIR['BASE'] + "data/Summary.txt"``.

    Args:
        document: Passed unchanged to ``Document``.
        sections: Iterable of section names accepted by
            ``Document.section_sentences``.
        coef: Forwarded to ``SectionMMR.rank`` as ``coef``.

    Note:
        ``num`` is not defined in this function; it is read from the
        enclosing (module) scope.
    """
    logit(document)
    doc = Document(document)
    all_sentences, _ = doc.all_sentences()
    collected = []
    for section_name in sections:
        sec_sentences, sec_offset = doc.section_sentences(section_name)

        # Ranker
        ranker = SectionMMR(all_sentences)
        ranker.rank(sec_offset=sec_offset, limit=len(sec_sentences),
                    coef=coef)
        ranked = ranker.scores

        # A section may yield fewer than ``num`` ranked sentences; cap the
        # loop so indexing never runs past the end of the ranking.
        summary = []
        for x in range(min(num, len(ranked))):
            idx = ranked[x][0] + sec_offset
            sent = doc[idx].sentence
            summary.append((sent, ranked[x][1], doc.get_section_name(idx)))
            collected.append(sent)

        logit("\nSection : " + section_name)
        logit(''.join('\n' + sent.encode('utf-8')
                      for sent, _score, _section in summary))

    summary_path = DIR['BASE'] + "data/Summary.txt"
    with open(summary_path, 'w') as sfile:
        sfile.write('\n'.join(collected).encode('utf-8'))
Пример #2
0
def getUnwanted(outfile):
    """Collect the lowest-scored TextRank sentences of every demo XML file.

    Changes the working directory to ``DIR['BASE'] + "demo/"`` and ranks
    each ``*.xml`` file found there with ``TextRank``.  The scores are
    sorted ascending by their second element and up to 12 of the first
    entries are taken per file.  The selected sentences (UTF-8 encoded,
    one per line) of all successfully processed files are written to
    ``outfile`` via ``writeToFile``.

    A file that raises during processing is reported on stdout and
    contributes nothing to the output; processing continues with the next
    file.

    Args:
        outfile: Destination passed to ``writeToFile``.  Because of the
            ``os.chdir`` call a relative path is resolved against the demo
            directory.
    """
    demo_dir = DIR['BASE'] + "demo/"
    os.chdir(demo_dir)
    num = 12
    total = 0
    chunks = []
    for fname in glob("*.xml"):
        try:
            doc = Document(fname)
            sentences, offset = doc.all_sentences()
            # Ranker
            ranker = TextRank(sentences)
            ranker.rank()
            # Slicing (instead of indexing 0..num-1) keeps short documents
            # from raising IndexError half-way through.
            lowest = sorted(ranker.scores, key=itemgetter(1))[:num]
            doc_lines = [doc[entry[0] + offset].sentence.encode('utf-8') + '\n'
                         for entry in lowest]
        except Exception as e:
            # Best-effort batch job: report and move on.
            print(fname + " : " + str(e))
            continue
        # Only commit a file's sentences once the whole file succeeded, so
        # a failing file never leaves partial output behind.
        chunks.extend(doc_lines)
        total += 1
        print(fname + " : Done")
    # for now this is the location of the file
    writeToFile(outfile, ''.join(chunks), 'w')
    print("Total number of files processed successfully : " + str(total))
Пример #3
0
def getUnwanted(outfile):
    """Write the bottom-scored TextRank sentences of the demo files.

    The working directory is switched to ``DIR['BASE'] + "demo/"``; every
    ``*.xml`` file in it is loaded as a ``Document`` and ranked with
    ``TextRank``.  Per file, the score entries are sorted ascending on
    their second element and at most 12 leading entries are kept.  The
    corresponding sentences are UTF-8 encoded, newline-terminated and
    written together to ``outfile`` through ``writeToFile``.

    Files that raise are reported on stdout, skipped entirely, and not
    counted in the final total.

    Args:
        outfile: Destination handed to ``writeToFile`` (a relative path is
            interpreted after the ``os.chdir`` into the demo directory).
    """
    os.chdir(DIR['BASE'] + "demo/")
    per_file_limit = 12
    processed = 0
    output_lines = []
    for xml_name in glob("*.xml"):
        try:
            doc = Document(xml_name)
            sentences, offset = doc.all_sentences()
            # Ranker
            ranker = TextRank(sentences)
            ranker.rank()
            ordered = sorted(ranker.scores, key=itemgetter(1))
            # ``[:per_file_limit]`` tolerates documents with fewer
            # sentences than the limit (plain indexing raised IndexError).
            picked = [
                doc[entry[0] + offset].sentence.encode('utf-8') + '\n'
                for entry in ordered[:per_file_limit]
            ]
        except Exception as e:
            # Batch boundary: keep going, but say which file failed and why.
            print(xml_name + " : " + str(e))
            continue
        # Append only after the file was handled completely so a failure
        # cannot leak a partial set of sentences into the output.
        output_lines.extend(picked)
        processed += 1
        print(xml_name + " : Done")
    # for now this is the location of the file
    writeToFile(outfile, ''.join(output_lines), 'w')
    print("Total number of files processed successfully : " + str(processed))
Пример #4
0
def summarize_secitons(document, sections, coef=0.8):
    """Summarize the given sections with SectionMMR and write the result.

    Each section named in ``sections`` is ranked by a ``SectionMMR``
    instance constructed from all sentences of the document.  At most
    ``num`` ranked sentences per section are logged via ``logit`` and
    accumulated; the accumulated sentences are finally joined with
    newlines, UTF-8 encoded and written to
    ``DIR["BASE"] + "data/Summary.txt"``.

    Args:
        document: Handed to ``Document`` as-is.
        sections: Iterable of section names for
            ``Document.section_sentences``.
        coef: Passed through to ``SectionMMR.rank``.

    Note:
        ``num`` comes from the enclosing (module) scope; it is not a local
        of this function.
    """
    logit(document)
    doc = Document(document)
    all_sentences, _ = doc.all_sentences()
    chosen = []
    for section_name in sections:
        sec_sentences, sec_offset = doc.section_sentences(section_name)

        # Ranker
        ranker = SectionMMR(all_sentences)
        ranker.rank(sec_offset=sec_offset, limit=len(sec_sentences),
                    coef=coef)
        ranking = ranker.scores

        # Short sections can produce fewer than ``num`` entries; bound the
        # loop instead of letting ``ranking[x]`` raise IndexError.
        summary = []
        for x in range(min(num, len(ranking))):
            idx = ranking[x][0] + sec_offset
            sent = doc[idx].sentence
            summary.append((sent, ranking[x][1], doc.get_section_name(idx)))
            chosen.append(sent)

        logit("\nSection : " + section_name)
        logit("".join("\n" + sent.encode("utf-8")
                      for sent, _score, _section in summary))

    out_path = DIR["BASE"] + "data/Summary.txt"
    with open(out_path, "w") as sfile:
        sfile.write("\n".join(chosen).encode("utf-8"))
Пример #5
0
def generateTrainFeatures(client_socket, infile, featurefile):
    """Append labelled training feature lines for one document.

    Positive examples ('+1') are the sentences of the 'abstract' section.
    Negative examples ('-1') are up to 5 sentences that pass
    ``validSentence``, taken while walking ``TextRank.scores`` from its
    last entry backwards (presumably the lowest-ranked sentences -- confirm
    against the ordering of ``TextRank.scores``).

    For each example the sentence is dependency-parsed through
    ``getDepParse``/``parseTrees``, converted by ``processTree`` against a
    ``Ranker`` built from the concatenated text of each section, and the
    resulting line is appended to ``featurefile``.

    Args:
        client_socket: Connection handed to ``getDepParse``.
        infile: Passed to ``Document``.
        featurefile: File that receives one appended line per example.
    """
    doc = Document(infile)
    all_sentences, all_offset = doc.all_sentences()
    #------------------------------------------------
    # Positive sentences
    pos_sents, offset = doc.section_sentences('abstract')
    sent_indices = range(offset, offset + len(pos_sents))
    #-----------------------------------------
    # Sectional Ranker: one text blob per section, keys in sorted order
    sections = [
        ''.join(str(block[key]) for key in sorted(block.keys()))
        for _sec, block in doc.document.items()
    ]
    sec_ranker = Ranker(sections)

    def append_features(label, sentences, section_indices):
        # Write "<label><processTree output>" for every sentence.
        for sentence, sec_idx in zip(sentences, section_indices):
            tree = parseTrees(getDepParse(client_socket, sentence))
            feature_string = label + processTree(tree, sec_ranker, sec_idx,
                                                 False)
            writeToFile(featurefile, feature_string + '\n', 'a')

    append_features('+1', pos_sents, sent2Section(doc, sent_indices))
    #------------------------------------------------
    # Negative sentences
    neg_ranker = TextRank(all_sentences)
    neg_ranker.rank()
    wanted = 5
    neg_sents = []
    neg_indices = []
    # Walk the scores from the end.  Iterating (rather than an unbounded
    # ``scores[x]`` with a shrinking x) stops cleanly when the document has
    # fewer than ``wanted`` valid sentences instead of raising IndexError.
    for entry in reversed(neg_ranker.scores):
        if len(neg_sents) == wanted:
            break
        idx = entry[0] + all_offset
        candidate = doc[idx]
        if not validSentence(candidate):
            continue
        neg_indices.append(idx)
        neg_sents.append(candidate.sentence.encode('utf-8'))

    append_features('-1', neg_sents, sent2Section(doc, neg_indices))
    #------------------------------------------------
    print("All input files processed to create feature vectors for training.")
Пример #6
0
def get_test_sentences(infile, outfile, backup=False):
    """Select the first TextRank sentences of a document as test samples.

    Up to 7 leading entries of ``TextRank.scores`` are taken without any
    validity filtering.  Their sentences are written (UTF-8 encoded, one
    per line) to ``outfile`` and recorded in the module-level ``test_data``
    dict under the key ``"<file code>-<sentence index>"`` together with the
    TextRank score and two context values from ``getContext``.

    Args:
        infile: Document path.  It must match the hard-coded
            ``/home/ankur/.../demo/<code>-parscit-section.xml`` pattern;
            otherwise ``re.match`` returns ``None`` and an
            ``AttributeError`` is raised.
        outfile: File overwritten with the selected sentences.
        backup: When true, the samples are also appended to
            ``DIR['BASE'] + "data/backup.txt"``.

    Returns:
        Tuple ``(ranker, section_idx, sent_idx)``: a ``Ranker`` built from
        the per-section text, the result of ``sent2Section`` for the
        selected sentences, and the selected sentence indices.
    """
    doc = Document(infile)
    sentences, offset = doc.all_sentences()
    text_ranker = TextRank(sentences)
    text_ranker.rank()
    scores = text_ranker.scores
    num = 7
    # The file code depends only on ``infile``; compute it once instead of
    # re-running the regex for every selected sentence.
    infi = re.match(
        r'/home/ankur/devbench/scientific/scisumm/demo/'
        r'(.+)-parscit-section\.xml', infile).group(1)
    lines = []
    sent_idx = []
    # Bound the loop by the number of available scores so documents with
    # fewer than ``num`` sentences no longer raise IndexError.
    for x in range(min(num, len(scores))):
        idx = scores[x][0] + offset
        encoded = doc[idx].sentence.encode('utf-8')
        sent_idx.append(idx)
        lines.append(encoded + '\n')
        #---------------------------------------------------
        # Storing the sentence in the dictionary for pickling for display
        test_data[infi + "-" + str(idx)] = {
            'sentence': encoded,
            'textrank': scores[x][1],
            'contextpre': getContext(doc, idx, -2),
            'contextpos': getContext(doc, idx, 2)}
    samples = ''.join(lines)
    writeToFile(outfile, samples, 'w')
    #-----------------------------------------
    # Calculating the sectional TF-IDF
    sections = [
        ''.join(str(block[key]) for key in sorted(block.keys()))
        for _sec, block in doc.document.items()
    ]
    ranker = Ranker(sections)
    #-----------------------------------------
    # The sent_idx needs to be converted to reflect the corresponding section
    # index
    section_idx = sent2Section(doc, sent_idx)
    if backup:
        backupfile = DIR['BASE'] + "data/backup.txt"
        writeToFile(backupfile, "\n---------" + str(doc) + "---------\n", 'a')
        writeToFile(backupfile, samples, 'a')
    return ranker, section_idx, sent_idx
Пример #7
0
def classifyDoc(document):
    """Score section-ranked sentences with svm_classify and emit a summary.

    Each sentence returned by ``getSecRankedSent`` is dependency-parsed,
    turned into a single feature line, and classified by running the
    external ``svm_classify`` binary against a fixed model file.  The
    sentences are then taken in ascending order of the classifier value
    until the running word count (split on single spaces) exceeds 130.
    The summary is written to ``DIR['DATA'] + "svm_summary.txt"`` and
    printed.

    Args:
        document: Passed to ``Document``.
    """
    data_dir = DIR['DATA']
    featurefile = data_dir + 'features_svm.txt'
    classify = DIR['BASE'] + "lib/svm-light/svm_classify"
    model = data_dir + "sec-tfidf-model.txt"
    outfile = data_dir + "svm-out-sent.txt"
    client_socket = getConnection()
    doc = Document(document)

    # Club the sentences of every section into one string for the ranker.
    sections = []
    for _sec, block in doc.document.items():
        pieces = [str(block[key]) for key in sorted(block.keys())]
        sections.append(''.join(pieces))
    sec_ranker = Ranker(sections)

    # NOTE(review): the TextRank result is not used below; ``sents`` is
    # rebound right after.  Kept as-is -- confirm it can be dropped.
    sents, offset = doc.all_sentences()
    ranker = TextRank(sents)
    ranker.rank()

    sents, sent_indices = getSecRankedSent(doc)
    # Sentence indices must be mapped to their section indices.
    sec_indices = sent2Section(doc, sent_indices)

    classified = []
    for sent, sec_idx in zip(sents, sec_indices):
        # dependency parse
        tree = parseTrees(getDepParse(client_socket, sent))
        # The feature file holds exactly one line per classifier run.
        deleteFiles([featurefile])
        features = "+1" + processTree(tree, sec_ranker, sec_idx, False)
        writeToFile(featurefile, features + '\n', 'a')
        deleteFiles([outfile])
        subprocess.call([classify, featurefile, model, outfile])
        with open(outfile, 'r') as ofile:
            raw_value = ofile.read()
        classified.append((sent, float(raw_value.strip())))

    # NOTE(review): ascending order selects the smallest classifier values
    # first -- confirm this is the intended direction.
    summary = []
    sum_len = 0
    for sent, _val in sorted(classified, key=itemgetter(1)):
        summary.append(sent)
        sum_len += len(sent.split(' '))
        if sum_len > 130:
            break

    joined = '\n'.join(summary)
    writeToFile(data_dir + "svm_summary.txt", joined, 'w')
    print(joined)
Пример #8
0
def summarize(document, all=True):
    """Build a TextRank summary, save it, and log its ROUGE-1 scores.

    Sentences are ranked with ``TextRank``; ranked sentences are added in
    order until ``num`` sentences were considered or adding the next one
    would push the word count (split on single spaces) above ``MAXLEN``.
    The summary is logged through ``logit``, written to
    ``DIR['BASE'] + "data/Summary.txt"`` and compared by ``PythonROUGE``
    (``ngram_order=1``) with the hard-coded reference
    ``data/P10-1024-Ref1.txt``.

    Args:
        document: Passed to ``Document``.
        all: If true use ``Document.all_sentences()``, otherwise
            ``Document.filtered_sentences()``.  (The name shadows the
            builtin; it is kept because it is part of the signature.)

    Note:
        ``num`` and ``MAXLEN`` are read from the enclosing (module) scope.
    """
    doc = Document(document)
    if all:
        sentences, offset = doc.all_sentences()
    else:
        sentences, offset = doc.filtered_sentences()

    # Ranker
    ranker = TextRank(sentences)
    ranker.rank()
    scores = ranker.scores

    # Selector
    summary = []
    sum_len = 0
    # Cap at the number of ranked sentences so short documents do not
    # raise IndexError on ``scores[x]``.
    for x in range(min(num, len(scores))):
        idx = scores[x][0] + offset
        sent = doc[idx].sentence
        n_words = len(sent.split(' '))
        if sum_len + n_words > MAXLEN:
            break
        summary.append((sent, scores[x][1], doc.get_section_name(idx)))
        sum_len += n_words

    logit("\nP10-1024")
    logit("\nAll Sentences" if all else "\nFiltered Sentences")
    logit("Length of summary : " + str(sum_len))
    logit(''.join(
        '\n' + "[" + section.encode('utf-8') + "] " + sent.encode('utf-8')
        for sent, _score, section in summary))

    # Printer
    # this has to be automated
    summary_path = DIR['BASE'] + "data/Summary.txt"
    with open(summary_path, 'w') as sfile:
        sfile.write('\n'.join([sent
                               for sent, sc, sec in summary]).encode('utf-8'))

    # Evaluator
    guess_summary_list = [summary_path]
    ref_summary_list = [[DIR['BASE'] + "data/P10-1024-Ref1.txt"]]
    recall, precision, F_measure = PythonROUGE(guess_summary_list,
                                               ref_summary_list,
                                               ngram_order=1)
    logit("Recall:{0} ; Precision:{1} ; F:{2}".format(recall, precision,
                                                      F_measure))
Пример #9
0
def summarize(document, all=True):
    """Produce, store and evaluate a TextRank summary of ``document``.

    The document's sentences are ranked with ``TextRank``.  Ranked
    sentences are accepted in order; selection stops after ``num``
    candidates or as soon as the next sentence would make the word count
    (split on single spaces) exceed ``MAXLEN``.  The result is logged with
    ``logit``, written to ``DIR['BASE'] + "data/Summary.txt"`` and scored
    with ``PythonROUGE`` (``ngram_order=1``) against the hard-coded
    reference file ``data/P10-1024-Ref1.txt``.

    Args:
        document: Passed to ``Document``.
        all: Selects ``Document.all_sentences()`` when true and
            ``Document.filtered_sentences()`` otherwise.  (Shadows the
            builtin of the same name; unchanged to keep the interface.)

    Note:
        ``num`` and ``MAXLEN`` are taken from the enclosing (module) scope.
    """
    doc = Document(document)
    fetch = doc.all_sentences if all else doc.filtered_sentences
    sentences, offset = fetch()

    # Ranker
    ranker = TextRank(sentences)
    ranker.rank()
    scores = ranker.scores

    # Selector
    summary = []
    sum_len = 0
    # ``min`` guards against documents with fewer than ``num`` ranked
    # sentences, where ``scores[x]`` used to raise IndexError.
    candidates = min(num, len(scores))
    for x in range(candidates):
        idx = scores[x][0] + offset
        sent = doc[idx].sentence
        word_count = len(sent.split(' '))
        if sum_len + word_count > MAXLEN:
            break
        summary.append((sent, scores[x][1], doc.get_section_name(idx)))
        sum_len += word_count

    logit("\nP10-1024")
    logit("\nAll Sentences" if all else "\nFiltered Sentences")
    logit("Length of summary : " + str(sum_len))
    log_lines = ['\n' + "[" + section.encode('utf-8') + "] " +
                 sent.encode('utf-8')
                 for sent, _score, section in summary]
    logit(''.join(log_lines))

    # Printer
    # this has to be automated
    out_path = DIR['BASE'] + "data/Summary.txt"
    with open(out_path, 'w') as sfile:
        sfile.write('\n'.join([sent for sent, sc, sec in summary]).
                    encode('utf-8'))

    # Evaluator
    guess_summary_list = [out_path]
    ref_summary_list = [[DIR['BASE'] + "data/P10-1024-Ref1.txt"]]
    recall, precision, F_measure = PythonROUGE(guess_summary_list,
                                               ref_summary_list,
                                               ngram_order=1)
    logit("Recall:{0} ; Precision:{1} ; F:{2}".format(recall, precision,
                                                      F_measure))
Пример #10
0
def generateTestFeatures(infile):
    """Interactively collect labels for the leading TextRank sentences.

    Walks ``TextRank.scores`` from the first entry and keeps up to 7
    sentences that pass ``validSentence``.  Each kept sentence is stored
    in the module-level ``test_data`` dict under
    ``"<file code>-<sentence index>"`` with its text, TextRank score and
    two context values from ``getContext``.  Afterwards every kept
    sentence is printed with its context, a line is read with
    ``raw_input`` and, with ``'1'`` appended, stored as ``'reallbl'``.

    Args:
        infile: Document path whose file name must match
            ``<code>-parscit-section.xml``; otherwise ``re.match`` returns
            ``None`` and an ``AttributeError`` is raised.
    """
    doc = Document(infile)
    #------------------------------------------------
    # For display and analysis
    filename = os.path.basename(infile)
    fcode = re.match(r'(.+)-parscit-section\.xml', filename).group(1)
    #------------------------------------------------
    all_sentences, all_offset = doc.all_sentences()
    ranker = TextRank(all_sentences)
    ranker.rank()
    wanted = 7
    sent_indices = []
    # Iterating over the scores ends the search when they are exhausted;
    # the previous unbounded ``scores[x]`` raised IndexError for documents
    # with fewer than ``wanted`` valid sentences.
    for entry in ranker.scores:
        if len(sent_indices) == wanted:
            break
        idx = entry[0] + all_offset
        if not validSentence(doc[idx]):
            continue
        sent_indices.append(idx)
        #------------------------------------------------
        # For display and analysis
        test_data[fcode + '-' + str(idx)] = {
            'sentence': doc[idx].sentence.encode('utf-8'),
            'textrank': entry[1],
            'contextpre': getContext(doc, idx, -2),
            'contextpos': getContext(doc, idx, 2)
        }
    #-----------------------------------------
    for sent_idx in sent_indices:
        key = fcode + '-' + str(sent_idx)
        record = test_data[key]
        print(key)
        print(record['contextpre'])
        print("----Main sentence Start----")
        print(record['sentence'])
        print("----Main sentence End----")
        print(record['contextpos'])
        # The typed text gets '1' appended (e.g. '+' becomes '+1').
        record['reallbl'] = raw_input() + '1'
Пример #11
0
def generateTestFeatures(infile):
    """Ask the user to label the top valid TextRank sentences of a file.

    Up to 7 sentences accepted by ``validSentence`` are taken in the order
    of ``TextRank.scores``.  For each one an entry is stored in the
    module-level ``test_data`` dict (key
    ``"<file code>-<sentence index>"``) holding the encoded sentence, its
    TextRank score and the ``getContext`` results for offsets -2 and 2.
    Each entry is then printed, one line is read via ``raw_input`` and
    saved with ``'1'`` appended under ``'reallbl'``.

    Args:
        infile: Path of the document.  Its base name has to match
            ``<code>-parscit-section.xml``; if it does not, ``re.match``
            yields ``None`` and ``AttributeError`` is raised.
    """
    doc = Document(infile)
    #------------------------------------------------
    # For display and analysis
    filename = os.path.basename(infile)
    fcode = re.match(r'(.+)-parscit-section\.xml', filename).group(1)
    #------------------------------------------------
    all_sentences, all_offset = doc.all_sentences()
    ranker = TextRank(all_sentences)
    ranker.rank()
    remaining = 7
    sent_indices = []
    # A plain ``for`` over the scores cannot run past the end, unlike the
    # earlier ``while`` that indexed ``scores[x]`` without a bound and
    # raised IndexError when too few sentences were valid.
    for entry in ranker.scores:
        if remaining == 0:
            break
        idx = entry[0] + all_offset
        if not validSentence(doc[idx]):
            continue
        sent_indices.append(idx)
        remaining -= 1
        #------------------------------------------------
        # For display and analysis
        key = fcode + '-' + str(idx)
        test_data[key] = {'sentence': doc[idx].sentence.encode('utf-8'),
                          'textrank': entry[1],
                          'contextpre': getContext(doc, idx, -2),
                          'contextpos': getContext(doc, idx, 2)}
    #-----------------------------------------
    for sent_idx in sent_indices:
        key = fcode + '-' + str(sent_idx)
        print(key)
        print(test_data[key]['contextpre'])
        print("----Main sentence Start----")
        print(test_data[key]['sentence'])
        print("----Main sentence End----")
        print(test_data[key]['contextpos'])
        # User input plus a trailing '1' forms the stored label.
        test_data[key]['reallbl'] = raw_input() + '1'
Пример #12
0
def get_neg_sentences(infile, outfile, backup=False):
    """Pick negative sample sentences from the tail of the TextRank scores.

    ``TextRank.scores`` is walked from its last entry backwards
    (presumably the lowest-ranked sentences -- confirm against the
    ordering of ``TextRank.scores``) and up to 5 sentences accepted by
    ``validSentence`` are kept.  They are written, UTF-8 encoded and one
    per line, to ``outfile``.

    Args:
        infile: Passed to ``Document``.
        outfile: File overwritten with the selected sentences.
        backup: When true, the samples are also appended to
            ``DIR['BASE'] + "data/backup.txt"`` below a "Negative" header.

    Returns:
        Tuple ``(ranker, section_idx)``: a ``Ranker`` built from the
        concatenated text of every section and the ``sent2Section`` result
        for the selected sentence indices.
    """
    doc = Document(infile)
    sentences, offset = doc.all_sentences()
    text_ranker = TextRank(sentences)
    text_ranker.rank()
    wanted = 5
    lines = []
    sent_idx = []
    # ``reversed`` visits scores[-1], scores[-2], ... and simply ends when
    # the scores run out; the previous unbounded negative indexing raised
    # IndexError when fewer than ``wanted`` sentences were valid.
    for entry in reversed(text_ranker.scores):
        if len(sent_idx) == wanted:
            break
        idx = entry[0] + offset
        candidate = doc[idx]
        if not validSentence(candidate):
            continue
        sent_idx.append(idx)
        lines.append(candidate.sentence.encode('utf-8') + '\n')
    samples = ''.join(lines)
    writeToFile(outfile, samples, 'w')
    #-----------------------------------------
    # Calculating the sectional TF-IDF
    sections = [
        ''.join(str(block[key]) for key in sorted(block.keys()))
        for _sec, block in doc.document.items()
    ]
    ranker = Ranker(sections)
    #-----------------------------------------
    # The sent_idx needs to be converted to reflect the corresponding section
    # index
    section_idx = sent2Section(doc, sent_idx)
    if backup:
        backupfile = DIR['BASE'] + "data/backup.txt"
        writeToFile(backupfile, "\n---------Negative---------\n", 'a')
        writeToFile(backupfile, samples, 'a')
    return ranker, section_idx
Пример #13
0
def summarize_secitons(document, sections):
    """Summarize each section with TextRank, save and ROUGE-score the result.

    For every name in ``sections`` the section's own sentences are ranked
    with ``TextRank``; up to ``num`` ranked sentences are logged through
    ``logit`` and collected.  The collected sentences are written,
    newline-separated and UTF-8 encoded, to
    ``DIR['BASE'] + "data/Summary.txt"`` and evaluated with
    ``PythonROUGE`` (``ngram_order=1``) against the hard-coded reference
    ``data/P10-1024-Ref1.txt``.

    Args:
        document: Passed unchanged to ``Document``.
        sections: Iterable of section names accepted by
            ``Document.section_sentences``.

    Note:
        ``num`` is read from the enclosing (module) scope.
    """
    logit(document)
    doc = Document(document)
    # The result is not used here; the call is kept because this block
    # cannot show whether later Document lookups rely on it -- TODO confirm
    # and drop.
    doc.all_sentences()
    collected = []
    for section_name in sections:
        sec_sentences, sec_offset = doc.section_sentences(section_name)

        # Ranker
        ranker = TextRank(sec_sentences)
        ranker.rank()
        ranked = ranker.scores

        # Sections shorter than ``num`` sentences must not raise
        # IndexError, so the loop is capped at the ranking's length.
        summary = []
        for x in range(min(num, len(ranked))):
            idx = ranked[x][0] + sec_offset
            sent = doc[idx].sentence
            summary.append((sent, ranked[x][1], doc.get_section_name(idx)))
            collected.append(sent)

        logit("\nSection : " + section_name)
        logit(''.join('\n' + sent.encode('utf-8')
                      for sent, _score, _section in summary))

    summary_path = DIR['BASE'] + "data/Summary.txt"
    with open(summary_path, 'w') as sfile:
        sfile.write('\n'.join(collected).encode('utf-8'))

    # Evaluator
    guess_summary_list = [summary_path]
    ref_summary_list = [[DIR['BASE'] + "data/P10-1024-Ref1.txt"]]
    recall, precision, F_measure = PythonROUGE(guess_summary_list,
                                               ref_summary_list,
                                               ngram_order=1)
    logit("Recall:{0} ; Precision:{1} ; F:{2}".format(recall, precision,
                                                      F_measure))
Пример #14
0
def classifyDoc(document):
    """Build a summary from TextRank sentences accepted by svm_classify.

    Sentences are visited in the order of ``TextRank.scores``.  Sentences
    rejected by ``validSentence`` or belonging to the 'abstract' section
    are skipped.  Every other sentence is dependency-parsed, written as a
    single feature line and classified by the external ``svm_classify``
    binary; it joins the summary when the classifier value is positive.

    The search stops when any of these holds: 10 sentences were accepted,
    the summary exceeds 130 words (split on single spaces), 20 sentences
    were classified, or the scores are exhausted.  The summary is written
    to ``DIR['DATA'] + "svm_summary.txt"`` and printed.

    Args:
        document: Passed to ``Document``.
    """
    featurefile = DIR['DATA'] + 'features_svm.txt'
    classify = DIR['BASE'] + "lib/svm-light/svm_classify"
    model = DIR['DATA'] + "sec-tfidf-model.txt"
    outfile = DIR['DATA'] + "svm-out-sent.txt"
    client_socket = getConnection()
    doc = Document(document)
    #-----------------------------------------
    # Clubbing sentences in sections and passing to the ranker
    sections = [
        ''.join(str(block[key]) for key in sorted(block.keys()))
        for _sec, block in doc.document.items()
    ]
    sec_ranker = Ranker(sections)
    sents, offset = doc.all_sentences()
    ranker = TextRank(sents)
    ranker.rank()
    looper = 20   # maximum number of classifier runs
    num = 10      # maximum number of accepted sentences
    summary = []
    sum_len = 0
    # Skipped sentences consume no ``looper`` budget, so the old
    # ``while num > 0`` loop could index past the end of the scores.
    # Iterating over the scores bounds the search.
    for entry in ranker.scores:
        idx = entry[0] + offset
        if (not validSentence(doc[idx])
                or doc.get_section_name(idx) == 'abstract'):
            continue
        sentence = doc[idx].sentence.encode('utf-8')
        #-----------------------------------------
        # dependency parse
        tree = parseTrees(getDepParse(client_socket, sentence))
        #-----------------------------------------
        # The sentence index needs to be converted to reflect the
        # corresponding section index
        sec_idx = sent2Section(doc, [idx])[0]
        #-----------------------------------------
        deleteFiles([featurefile])
        feature_string = "+1" + processTree(tree, sec_ranker, sec_idx, False)
        writeToFile(featurefile, feature_string + '\n', 'a')
        deleteFiles([outfile])
        subprocess.call([classify, featurefile, model, outfile])
        with open(outfile, 'r') as ofile:
            sent_val = float(ofile.read().strip())
        if sent_val > 0:
            summary.append(sentence)
            num -= 1
            sum_len += len(sentence.split(' '))
        if sum_len > 130:
            break
        looper -= 1
        if looper == 0:
            print("Looper Done")
            break
        if num == 0:
            break
    joined = '\n'.join(summary)
    writeToFile(DIR['DATA'] + "svm_summary.txt", joined, 'w')
    print(joined)
Пример #15
0
def classifyDoc(document):
    """Summarize a document using TextRank order and an SVM filter.

    Candidates come from ``TextRank.scores`` in order.  A candidate is
    skipped when ``validSentence`` rejects it or when its section name is
    'abstract'.  Remaining candidates are dependency-parsed, converted to
    one feature line and passed to the external ``svm_classify`` program;
    a positive classifier value adds the sentence to the summary.

    Processing ends after 10 accepted sentences, once the summary is
    longer than 130 words (split on single spaces), after 20 classifier
    runs, or when no scores remain.  The summary is written to
    ``DIR['DATA'] + "svm_summary.txt"`` and printed.

    Args:
        document: Passed to ``Document``.
    """
    featurefile = DIR['DATA'] + 'features_svm.txt'
    classify = DIR['BASE'] + "lib/svm-light/svm_classify"
    model = DIR['DATA'] + "sec-tfidf-model.txt"
    outfile = DIR['DATA'] + "svm-out-sent.txt"
    client_socket = getConnection()
    doc = Document(document)
    #-----------------------------------------
    # Clubbing sentences in sections and passing to the ranker
    sections = []
    for _sec, block in doc.document.items():
        sections.append(
            ''.join(str(block[key]) for key in sorted(block.keys())))
    sec_ranker = Ranker(sections)
    sents, offset = doc.all_sentences()
    ranker = TextRank(sents)
    ranker.rank()
    runs_left = 20       # classifier invocations still allowed
    accepts_left = 10    # summary sentences still wanted
    summary = []
    sum_len = 0
    # Only classified sentences count against ``runs_left``; skipped ones
    # do not, which let the former unbounded ``scores[x]`` loop raise
    # IndexError.  A ``for`` over the scores cannot overrun.
    for entry in ranker.scores:
        idx = entry[0] + offset
        if not validSentence(doc[idx]):
            continue
        if doc.get_section_name(idx) == 'abstract':
            continue
        encoded = doc[idx].sentence.encode('utf-8')
        #-----------------------------------------
        # dependency parse
        tree = parseTrees(
            getDepParse(client_socket, encoded))
        #-----------------------------------------
        # The sentence index needs to be converted to reflect the
        # corresponding section index
        sec_idx = sent2Section(doc, [idx])[0]
        #-----------------------------------------
        deleteFiles([featurefile])
        feature_string = "+1" + processTree(tree, sec_ranker, sec_idx, False)
        writeToFile(featurefile, feature_string + '\n', 'a')
        deleteFiles([outfile])
        subprocess.call([classify, featurefile, model, outfile])
        with open(outfile, 'r') as ofile:
            sent_val = float(ofile.read().strip())
        if sent_val > 0:
            summary.append(encoded)
            accepts_left -= 1
            sum_len += len(encoded.split(' '))
        if sum_len > 130:
            break
        runs_left -= 1
        if runs_left == 0:
            print("Looper Done")
            break
        if accepts_left == 0:
            break
    result = '\n'.join(summary)
    writeToFile(DIR['DATA'] + "svm_summary.txt", result, 'w')
    print(result)
Пример #16
0
def generateTrainFeatures(client_socket, infile, featurefile):
    """Append training feature lines for one document and record them.

    Positive examples ('+1') are the sentences of the 'abstract' section.
    Negative examples ('-1') are up to 5 sentences accepted by
    ``validSentence``, found by walking ``TextRank.scores`` from its last
    entry backwards (presumably the lowest-ranked sentences -- confirm
    against the ordering of ``TextRank.scores``).

    Every example is dependency-parsed, converted by ``processTree``
    against a ``Ranker`` built from the concatenated text of each section,
    appended as one line to ``featurefile`` and stored in the module-level
    ``train_data`` dict under ``"<file code>-<sentence index>"``.

    Args:
        client_socket: Connection handed to ``getDepParse``.
        infile: Document path whose base name must match
            ``<code>-parscit-section.xml``; otherwise ``re.match`` returns
            ``None`` and an ``AttributeError`` is raised.
        featurefile: File that receives one appended line per example.
    """
    doc = Document(infile)
    all_sentences, all_offset = doc.all_sentences()
    #------------------------------------------------
    # For display and analysis
    filename = os.path.basename(infile)
    fcode = re.match(r'(.+)-parscit-section\.xml', filename).group(1)
    #------------------------------------------------
    # Positive sentences
    pos_sents, offset = doc.section_sentences('abstract')
    sent_indices = range(offset, offset + len(pos_sents))
    #-----------------------------------------
    # Sectional Ranker: one text blob per section, keys in sorted order
    sections = [
        ''.join(str(block[key]) for key in sorted(block.keys()))
        for _sec, block in doc.document.items()
    ]
    sec_ranker = Ranker(sections)

    def record_examples(label, sentences, indices, section_indices):
        # Emit one feature line per sentence and mirror it in train_data.
        for sentence, sent_idx, sec_idx in zip(sentences, indices,
                                               section_indices):
            key = fcode + '-' + str(sent_idx)
            tree = parseTrees(getDepParse(client_socket, sentence))
            feature_string = label + processTree(tree, sec_ranker, sec_idx,
                                                 1, False)
            train_data[key] = {
                'sentence': doc[sent_idx].sentence.encode('utf-8'),
                'reallbl': label,
                'features': feature_string}
            writeToFile(featurefile, feature_string + '\n', 'a')

    record_examples('+1', pos_sents, sent_indices,
                    sent2Section(doc, sent_indices))
    #------------------------------------------------
    # Negative sentences
    neg_ranker = TextRank(all_sentences)
    neg_ranker.rank()
    wanted = 5
    neg_sents = []
    neg_indices = []
    # ``reversed`` visits scores[-1], scores[-2], ... and ends when the
    # scores are exhausted; the earlier unbounded negative indexing raised
    # IndexError when fewer than ``wanted`` sentences were valid.
    for entry in reversed(neg_ranker.scores):
        if len(neg_sents) == wanted:
            break
        idx = entry[0] + all_offset
        candidate = doc[idx]
        if not validSentence(candidate):
            continue
        neg_indices.append(idx)
        neg_sents.append(candidate.sentence.encode('utf-8'))

    record_examples('-1', neg_sents, neg_indices,
                    sent2Section(doc, neg_indices))
    #------------------------------------------------
    print("All input files processed to create feature vectors for training.")