Пример #1
0
def auto_align_list(filesSource,
                    filesTarget,
                    lfAlignerSh='../LF_aligner_3.11.sh',
                    lang1='de',
                    lang2='de'):
    """Align each source/target file pair with the LF Aligner script.

    Parameters
    ----------
    filesSource : list of source-language file paths.
    filesTarget : list of target-language file paths; must have the same
        length as ``filesSource`` (aligned pairwise by position).
    lfAlignerSh : path to the LF Aligner shell script.
    lang1, lang2 : language codes passed through to the aligner.

    Raises
    ------
    ValueError
        If the two file lists differ in length.
    """
    if len(filesSource) != len(filesTarget):
        raise ValueError('Numbers of source and target files are not equal')

    # zip() pairs the two lists directly instead of indexing by position.
    for src, tgt in zip(filesSource, filesTarget):
        Aligner.call_lf_aligner_auto(src, tgt, lfAlignerSh, lang1, lang2)
Пример #2
0
def auto_align_list_webcgi(filesSource,
                           filesTarget,
                           lfAlignerSh='./LF_aligner_3.11.sh',
                           lang1='de',
                           lang2='de'):
    """Align file pairs with LF Aligner (web/cgi-bin variant).

    This copy of the function exists to cope with the web version using
    cgi-bin; note the different default path to the LF Aligner script.

    Parameters
    ----------
    filesSource : list of source-language file paths.
    filesTarget : list of target-language file paths; same length as
        ``filesSource``.
    lfAlignerSh : path to the LF Aligner shell script (cgi-bin relative).
    lang1, lang2 : language codes passed through to the aligner.

    Raises
    ------
    ValueError
        If the two file lists differ in length.
    """
    if len(filesSource) != len(filesTarget):
        raise ValueError('Numbers of source and target files are not equal')

    # zip() pairs the two lists directly instead of indexing by position.
    for src, tgt in zip(filesSource, filesTarget):
        Aligner.call_lf_aligner_auto(src, tgt, lfAlignerSh, lang1, lang2)
Пример #3
0
def alignerSpark(dict,genome, hashDF, sc, dict_map):
    """Align each sequence in ``dict_map`` against ``genome`` using Spark.

    For every sequence id ``i``, finds k-mer seed hits (k = 10), keeps only
    hits whose genome position falls strictly inside the window
    ``dict_map[i] = (start, end)``, and — when at least 3 seeds survive —
    aligns the sequence against each candidate genome region, choosing a
    local (Smith-Waterman) or global (Needleman-Wunsch) alignment based on
    the best candidate's mismatch percentage.

    NOTE(review): the first parameter shadows the ``dict`` builtin; it is
    presumably a mapping of sequence id -> sequence string (it is indexed
    as ``dict[i]`` below) — confirm with callers.
    """
    k = 10  # seed (k-mer) length
    for i in dict_map.keys():
        print ("• Allineamento sequenza n°", i)
        # Seed hits for sequence i; POS_GEN holds candidate genome positions.
        reDF = Seeds.Sparkseeds(dict, i, k, hashDF, sc)
        reDF = reDF.withColumn('ex', F.explode('POS_GEN'))
        # Flag = 1 when the exploded genome position lies strictly inside
        # the (start, end) window for this sequence.
        reDF = reDF.withColumn('Flag', F.when((F.col('ex') < dict_map[i][1]) & (F.col("ex") > dict_map[i][0] ), 1).otherwise(0))
        reDF = reDF.filter(reDF.Flag == 1).select(reDF.NUM_SEQ, reDF.ID_SEQ, reDF.POS_SEQ, reDF.POS_GEN, reDF.Flag)
        # Require at least 3 in-window seeds before attempting an alignment.
        # NOTE(review): reDF.rdd.collect() is invoked twice below; caching
        # or collecting once would avoid recomputing the DataFrame.
        if reDF.count() >= 3:
        #     # print("0 per Allineamento locale")
        #     # print("1 per Allineamento globale")
        #     # scelta = int(input("Scelta tipologia di allineamento: "))
            seedArray = [x["POS_SEQ"] for x in reDF.rdd.collect()]
        #     print("SeedArray finale:", seedArray)
            PG = [x["POS_GEN"] for x in reDF.rdd.collect()]
            optloc = None
            df,min_percentage = best_choice(dict, i, PG, seedArray, genome,sc)
            Gen = [x["GEN"] for x in df.rdd.collect()]
            for gen in Gen:
                D, B = Aligner.createB(dict[i], gen)
                # Local alignment when the mismatch percentage of the best
                # candidate is below 60%; global alignment otherwise.
                if ((100-min_percentage[0])<60.0):
                #if scelta == 0:
                    A,optloc = Aligner.local_align(dict[i], gen, Aligner.ScoreParam())  # Smith-Waterman
                    bt = Aligner.backtrack(B, optloc, A)
                else:
                    M = Aligner.affine_align(dict[i], gen, Aligner.ScoreParam())  # Needleman-Wunsch
                    bt = Aligner.backtrack(B, optloc, M)
                aligned_word_1, aligned_word_2, operations, line = Aligner.align(gen, dict[i], bt)
                print("Lunghezza sequenze: ", len(dict[i]), "| Numero operazioni: ", len(operations))
                alignment_table = [aligned_word_1, line, operations, line, aligned_word_2]
                print(tb.tabulate(alignment_table, tablefmt="orgtbl"))
                print()
        else:
            print()
Пример #4
0
def MakeTwoLangCorpus(alignedBooksPath, corpusPath):
    """Merge every aligned book in a directory into one corpus file.

    Loads each aligned-book file found in ``alignedBooksPath``, combines
    them into a single aligned corpus and writes it to ``corpusPath``.
    """
    loaded_books = []
    for entry in os.listdir(alignedBooksPath):
        book_path = os.path.join(alignedBooksPath, entry)
        book = Aligner.AlignedMultiText()
        book.LoadFromFile(book_path)
        loaded_books.append(book)

    # Build the combined corpus and persist it in one step.
    Aligner.MakeAlignedCorpus(loaded_books).SaveToFile(corpusPath)

    return None
Пример #5
0
def get_weight(training_set):
    weights = [
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    ]
    features, weights = Aligner.align(training_set.p_str_tokens,
                                      training_set.h_str_tokens, weights)
    print weights
Пример #6
0
 def Activated(self):
     """Open the Caliper, Mover and Aligner tool panels.

     For each tool module: import it, reload it (so code edits are picked
     up without restarting the host application), undock its widget and
     centre the widget on screen.
     """
     # do something here...
     #import kicadStepUptools
     #reload_lib( kicadStepUptools )
     import Caliper
     reload_lib(Caliper)
     Caliper.Cp_undock()
     Caliper.Cp_centerOnScreen(Caliper.CPDockWidget)
     import Mover
     reload_lib(Mover)
     Mover.Mv_undock()
     Mover.Mv_centerOnScreen(Mover.MVDockWidget)
     import Aligner
     reload_lib(Aligner)
     Aligner.Alg_undock()
     Aligner.Alg_centerOnScreen (Aligner.ALGDockWidget)
Пример #7
0
 def setUp(self):
     """Fixture: Flask-suitability premise vs. question; expected score 4."""
     premise = "The current documentation emphasizes that Flask is best suited to smaller projects"
     hypothesis = "is flask good for large apps"
     self.p = premise
     self.h = hypothesis
     self.p_str_tokens = word_tokenize(premise)
     self.h_str_tokens = word_tokenize(hypothesis)
     self.weights = 'default'
     self.aligner = Aligner.Aligner()
     self.target = 4
Пример #8
0
 def setUp(self):
     """Fixture: sentences differing only in the final noun; target 6."""
     premise, hypothesis = "he ate the hats", "he ate the pears"
     self.p, self.h = premise, hypothesis
     self.p_str_tokens = word_tokenize(premise)
     self.h_str_tokens = word_tokenize(hypothesis)
     self.weights = 'default'
     self.aligner = Aligner.Aligner()
     self.target = 6
Пример #9
0
 def setUp(self):
     """Fixture: degenerate case — both sentences empty; target 6."""
     empty_sentence = ''
     self.p = empty_sentence
     self.h = empty_sentence
     self.p_str_tokens = word_tokenize(empty_sentence)
     self.h_str_tokens = word_tokenize(empty_sentence)
     self.weights = 'default'
     self.aligner = Aligner.Aligner()
     self.target = 6
Пример #10
0
 def setUp(self):
     """Fixture: nginx load-balancer statement vs. question; target 1."""
     premise = "for scripts nginx can also serve as a very capable software load balancer"
     hypothesis = "can Nginx be a load balancer?"
     self.p, self.h = premise, hypothesis
     self.p_str_tokens = word_tokenize(premise)
     self.h_str_tokens = word_tokenize(hypothesis)
     self.weights = 'default'
     self.aligner = Aligner.Aligner()
     self.target = 1
Пример #11
0
 def setUp(self):
     """Fixture: Flask large-application claim vs. question; target 4."""
     premise = "However Flask is just not designed for large applications"
     hypothesis = "is flask good for large apps"
     self.p, self.h = premise, hypothesis
     self.p_str_tokens = word_tokenize(premise)
     self.h_str_tokens = word_tokenize(hypothesis)
     self.weights = 'default'
     self.aligner = Aligner.Aligner()
     self.target = 4
Пример #12
0
def main():
    """Command-line entry point for the dictionary/alignment toolchain.

    Dispatches on ``sys.argv[1]``:
      -MakeDictionary    <lang1> <bible1> <lang2> <bible2> <dictsPath>
      -DownloadLibrary   <path>
      -AlignBooks        <libraryPath> <lang1> <lang2> <dictsPath> <outPath>
      -MakeTwoLangCorpus <alignedBooksPath> <corpusPath>
    With no arguments at all, runs a hard-coded demo that scores two
    EN/RU sentence pairs using fixed local dictionary paths.
    """
    # The modes are mutually exclusive, so chain with elif instead of
    # re-evaluating every condition as the original did.
    if len(sys.argv) == 7 and sys.argv[1] == '-MakeDictionary':
        firstLang = sys.argv[2]
        firstLangBiblePath = sys.argv[3]
        secondLang = sys.argv[4]
        secondLangBiblePath = sys.argv[5]
        dictionariesPath = sys.argv[6]
        MakeDictionary(firstLang, firstLangBiblePath, secondLang,
                       secondLangBiblePath, dictionariesPath)
    elif len(sys.argv) == 3 and sys.argv[1] == '-DownloadLibrary':
        DownloadLibrary(sys.argv[2])
    elif len(sys.argv) == 7 and sys.argv[1] == '-AlignBooks':
        libraryPath = sys.argv[2]
        firstLang = sys.argv[3]
        secondLang = sys.argv[4]
        dictionariesPath = sys.argv[5]
        alignedBooksPath = sys.argv[6]
        AlignBooks(libraryPath, firstLang, secondLang, dictionariesPath,
                   alignedBooksPath)
    elif len(sys.argv) == 4 and sys.argv[1] == '-MakeTwoLangCorpus':
        MakeTwoLangCorpus(sys.argv[2], sys.argv[3])
    elif len(sys.argv) == 1:
        # Demo mode: hard-coded dictionaries and sentence pairs.
        dictionary1 = Dictionary.Dictionary()
        dictionary1.LoadFromFile(
            "C:\\Users\\Bober\\Desktop\\ботва\\8\\Мат СК\\Dictionaries_2\\EN-RU.xml"
        )
        dictionary2 = Dictionary.Dictionary()
        dictionary2.LoadFromFile(
            "C:\\Users\\Bober\\Desktop\\ботва\\8\\Мат СК\\Dictionaries_2\\RU-EN.xml"
        )
        aligner = Aligner.Aligner("EN", "RU", dictionary1, dictionary2)

        enSentence1 = "The emperor of Lilliput, attended by several of the nobility, comes to see the author in his confinement"
        ruSentence1 = "Император Лилипутии в сопровождении многочисленных вельмож приходит навестить автора в его заключении"
        # Message typo fixed: "fo" -> "for".
        print('-----Computing alignment value for sentences:')
        print('1. ', enSentence1)
        print('2. ', ruSentence1)
        value1 = aligner._getSencencesAlignmentValue(enSentence1, ruSentence1)

        print(value1)

        enSentence2 = "I took them all in my right hand, put five of them into my coat-pocket; and as to the sixth, I made a countenance as if I would eat him alive"
        ruSentence2 = "Его императорское величество часто обращался ко мне с вопросами, на которые я отвечал ему, но ни он, ни я не понимали ни слова из того, что говорили друг другу"
        print('-----Computing alignment value for sentences:')
        print('1. ', enSentence2)
        print('2. ', ruSentence2)
        value2 = aligner._getSencencesAlignmentValue(enSentence2, ruSentence2)

        print(value2)
Пример #13
0
 def setUp(self):
     """Fixture: contradictory cat/pizza sentences; expected score 4."""
     premise = "the cat ate the tasty pizza"
     hypothesis = 'the cat never ate'
     self.p, self.h = premise, hypothesis
     self.p_str_tokens = word_tokenize(premise)
     self.h_str_tokens = word_tokenize(hypothesis)
     self.weights = 'default'
     self.aligner = Aligner.Aligner()
     self.target = 4
Пример #14
0
 def setUp(self):
     """Fixture: multi-line news premise vs. short paraphrase; target 1."""
     premise = """
     Highland Park native was overwhelmed by prospect of prison from charges
     that he stole MIT articles electronically.
     """
     hypothesis = "the highland park native was overwhelmed."
     self.p, self.h = premise, hypothesis
     self.p_str_tokens = word_tokenize(premise)
     self.h_str_tokens = word_tokenize(hypothesis)
     self.weights = 'default'
     self.aligner = Aligner.Aligner()
     self.target = 1
Пример #15
0
    def setUp(self):
        """Fixture: Stuxnet/Bush statement vs. question; target 1."""
        # Gold answers: questions 0-1 are 'Yes', questions 2-6 are 'No'.
        self.answer = {i: ('Yes' if i < 2 else 'No') for i in range(7)}

        premise = "Sanger reports that the Stuxnet virus was developed first under President Bush in 2006 under the"
        hypothesis = "was stuxnet created under President Bush"
        self.p, self.h = premise, hypothesis
        self.p_str_tokens = word_tokenize(premise)
        self.h_str_tokens = word_tokenize(hypothesis)
        self.weights = 'default'
        self.aligner = Aligner.Aligner()
        self.target = 1
Пример #16
0
    def setUp(self):
        """Fixture: J.J. Abrams / Star Wars statement vs. question; target 1."""
        # Gold answers: questions 0-1 are 'Yes', questions 2-6 are 'No'.
        self.answer = {i: ('Yes' if i < 2 else 'No') for i in range(7)}

        premise = "jj Abrams is directing Star Wars: Episode vii"
        hypothesis = "is jj abrams directing star wars vii"
        self.p, self.h = premise, hypothesis
        self.p_str_tokens = word_tokenize(premise)
        self.h_str_tokens = word_tokenize(hypothesis)
        self.weights = 'default'
        self.aligner = Aligner.Aligner()
        self.target = 1
Пример #17
0
    def setUp(self):
        """Fixture: Coca Cola premise containing the question; target 1."""
        # Gold answers: questions 0-1 are 'Yes', questions 2-6 are 'No'.
        self.answer = {i: ('Yes' if i < 2 else 'No') for i in range(7)}

        premise = "Coca Cola Drink who invented Coca Cola?"
        hypothesis = "who invented Coca Cola?"
        self.p, self.h = premise, hypothesis
        self.p_str_tokens = word_tokenize(premise)
        self.h_str_tokens = word_tokenize(hypothesis)
        self.weights = 'default'
        self.aligner = Aligner.Aligner()
        self.target = 1
Пример #18
0
    def setUp(self):
        """Fixture: David Gilmour / Pink Floyd statement vs. question; target 1."""
        # Gold answers: questions 0-1 are 'Yes', questions 2-6 are 'No'.
        self.answer = {i: ('Yes' if i < 2 else 'No') for i in range(7)}

        premise = "David Gilmour is a guitarist and vocalist with British rock band Pink Floyd, and was voted No."
        hypothesis = "was david gilmour the guitarist for pink floyd"
        self.p, self.h = premise, hypothesis
        self.p_str_tokens = word_tokenize(premise)
        self.h_str_tokens = word_tokenize(hypothesis)
        self.weights = 'default'
        self.aligner = Aligner.Aligner()
        self.target = 1
Пример #19
0
    def setUp(self):
        """Fixture: Google text-to-speech API statement vs. question; target 3."""
        # Gold answers: questions 0-1 are 'Yes', questions 2-6 are 'No'.
        self.answer = {i: ('Yes' if i < 2 else 'No') for i in range(7)}

        premise = "There's no official Google API for the text to speech"
        hypothesis = "does google have a text to speech API"
        self.p, self.h = premise, hypothesis
        self.p_str_tokens = word_tokenize(premise)
        self.h_str_tokens = word_tokenize(hypothesis)
        self.weights = 'default'
        self.aligner = Aligner.Aligner()
        self.target = 3
Пример #20
0
    def setUp(self):
        """Fixture: Ariel Sharon stroke statement vs. question; target 1."""
        # Gold answers: questions 0-1 are 'Yes', questions 2-6 are 'No'.
        self.answer = {i: ('Yes' if i < 2 else 'No') for i in range(7)}

        premise = "When the paramedic inspected him, he realized that Ariel Sharon was having a stroke"
        hypothesis = "did Ariel Sharon have a stroke?"
        self.p, self.h = premise, hypothesis
        self.p_str_tokens = word_tokenize(premise)
        self.h_str_tokens = word_tokenize(hypothesis)
        self.weights = 'default'
        self.aligner = Aligner.Aligner()
        self.target = 1
Пример #21
0
def learn_weights(training_set, learning_epochs, burn_in_epochs, learning_rate,
                  learning_rate_multiplier):
    weights = [
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    ]
    weights_history = []

    for i in range(learning_epochs):
        print '*** Starting epoch %s ***' % i
        learning_rate *= learning_rate_multiplier
        logging.warning('Starting epoch %s with learning rate %s' %
                        (i, learning_rate))
        shuffle(training_set)

        for index, problem in enumerate(training_set):
            print '* Starting problem %s of %s in epoch %s*' % (
                index, len(training_set), i)
            gold_features = gold_featurizer.featurize(problem)
            logging.warning('\nStarting weights:\n%s' % weights)
            #logging.warning('Problem:\n%s\n%s' % (problem.p_str_tokens,
            #problem.h_str_tokens))
            #logging.warning('\nGold features:\n%s' % (gold_features))

            predicted_alignment, predicted_features = Aligner.align(
                problem.p_str_tokens, problem.h_str_tokens, weights)
            #logging.warning('\nPredicted features\n:%s' % predicted_features)

            weights = weights + (learning_rate *
                                 (gold_features - predicted_features))
            #diff =  gold_features - predicted_features
            #logging.warning('\nUnrated weights difference:\n%s' % diff)
            logging.warning('Summed rated weights:\n%s' % weights)

        weights = weights / sqrt(sum([i**2 for i in weights]))
        logging.warning('L2 normalization:\n%s' % weights)
        weights_history.append(weights)
        logging.warning('\n\nWeights history:\n%s' % weights_history)

    weights_averaged = 1 / (learning_epochs - burn_in_epochs) * sum(
        weights_history[burn_in_epochs:])
    return weights_averaged
Пример #22
0
    def setUp(self):
        """Fixture: Sergey Brin / Google statement vs. question; target 1."""
        # Gold answers: questions 0-1 are 'Yes', questions 2-6 are 'No'.
        self.answer = {i: ('Yes' if i < 2 else 'No') for i in range(7)}

        premise = "Sergey Mikhaylovich Brin (born August 21, 1973) is a Russian -born American computer scientist and Internet entrepreneur who, with Larry Page, co-founded Google, one "
        hypothesis = "did Sergey Brin co-found google?"
        self.p, self.h = premise, hypothesis
        self.p_str_tokens = word_tokenize(premise)
        self.h_str_tokens = word_tokenize(hypothesis)
        self.weights = 'default'
        self.aligner = Aligner.Aligner()
        self.target = 1
Пример #23
0
def AlignBooks(libraryPath, firstLang, secondLang, dictionariesPath,
               alignedBooksPath):
    """Align every eligible parallel book pair in a library.

    Recreates ``alignedBooksPath`` from scratch, loads the library index,
    keeps only parallel books that have exactly one locally available
    book in each of the two requested languages, aligns each pair and
    writes one numbered XML file per pair.
    """
    # Start from a clean output directory.
    if os.path.exists(alignedBooksPath):
        shutil.rmtree(alignedBooksPath)
    os.makedirs(alignedBooksPath)

    library = Library.Library()
    library.LoadFromFile(os.path.join(libraryPath, 'library.xml'))

    parallelBooksToAlign = []
    for parallelBook in library.ParallelBooks:
        # Require exactly one downloaded book per language.
        # (`is not None` instead of `!= None`, per PEP 8.)
        hasFirstLangBook = len([
            b for b in parallelBook.Books
            if b.Language == firstLang and b.LocalFilePath is not None
        ]) == 1
        hasSecondLangBook = len([
            b for b in parallelBook.Books
            if b.Language == secondLang and b.LocalFilePath is not None
        ]) == 1
        if hasFirstLangBook and hasSecondLangBook:
            parallelBooksToAlign.append(parallelBook)

    # Load both translation directions.
    dictionary1 = Dictionary.Dictionary()
    dictionary1.LoadFromFile(
        os.path.join(dictionariesPath, firstLang + '-' + secondLang + '.xml'))
    dictionary2 = Dictionary.Dictionary()
    dictionary2.LoadFromFile(
        os.path.join(dictionariesPath, secondLang + '-' + firstLang + '.xml'))

    aligner = Aligner.Aligner(firstLang, secondLang, dictionary1, dictionary2)
    for i, parallelBook in enumerate(parallelBooksToAlign):
        print(i)  # progress indicator
        alignedBookPath = os.path.join(alignedBooksPath, str(i) + '.xml')
        firstBook = [
            b for b in parallelBook.Books if b.Language == firstLang
        ][0]
        secondBook = [
            b for b in parallelBook.Books if b.Language == secondLang
        ][0]
        alignedBook = aligner.AlignBooks(firstBook, secondBook)
        alignedBook.SaveToFile(alignedBookPath)
Пример #24
0
def learn_weights(training_set, learning_epochs, burn_in_epochs,
learning_rate, learning_rate_multiplier):
    weights = [
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    weights_history = []

    for i in range(learning_epochs):
        print '*** Starting epoch %s ***' % i
        learning_rate *= learning_rate_multiplier
        logging.warning('Starting epoch %s with learning rate %s' %
        (i, learning_rate))
        shuffle(training_set)

        for index, problem in enumerate(training_set):
            print '* Starting problem %s of %s in epoch %s*' % (index, len(training_set), i)
            gold_features = gold_featurizer.featurize(problem)
            logging.warning('\nStarting weights:\n%s' % weights)
            #logging.warning('Problem:\n%s\n%s' % (problem.p_str_tokens,
            #problem.h_str_tokens))
            #logging.warning('\nGold features:\n%s' % (gold_features))

            predicted_alignment, predicted_features = Aligner.align(
                    problem.p_str_tokens, problem.h_str_tokens, weights)
            #logging.warning('\nPredicted features\n:%s' % predicted_features)

            weights = weights + (learning_rate *
            (gold_features - predicted_features))
            #diff =  gold_features - predicted_features
            #logging.warning('\nUnrated weights difference:\n%s' % diff)
            logging.warning('Summed rated weights:\n%s' % weights)

        weights = weights / sqrt(sum([i ** 2 for i in weights]))
        logging.warning('L2 normalization:\n%s' % weights)
        weights_history.append(weights)
        logging.warning('\n\nWeights history:\n%s' % weights_history)

    weights_averaged = 1 / (learning_epochs
    - burn_in_epochs) * sum(weights_history[burn_in_epochs:])
    return weights_averaged
Пример #25
0
    def setUp(self):
        """Fixture: Michael Laski premise vs. contradictory claim; target 6."""
        # Gold answers: questions 0-1 are 'Yes', questions 2-6 are 'No'.
        self.answer = {i: ('Yes' if i < 2 else 'No') for i in range(7)}

        premise = "The Communist Party USA was a small Maoist political party which was founded in 1965 by members of the Communist Party around Michael Laski who took the side of China in the Sino-Soviet split."
        hypothesis = "Michael Laski was an opponent of China."
        self.p, self.h = premise, hypothesis
        self.p_str_tokens = word_tokenize(premise)
        self.h_str_tokens = word_tokenize(hypothesis)
        self.weights = 'default'
        self.aligner = Aligner.Aligner()
        self.target = 6
Пример #26
0
        # NOTE(review): fragment of a larger function — `seq2sp_dict`,
        # `sequenceTodiscard`, `FinalSp2Seq`, `args`, `filteredfasta`,
        # `TmpAli`, `FinalAli`, `logger` and `end` come from the
        # enclosing scope (not visible here).
        # Write the final species:sequence mapping, one "sp:seq" per line,
        # skipping sequences that were discarded during filtering.
        lines = []
        for (seq, sp) in seq2sp_dict.items():
            if seq not in sequenceTodiscard:
                line = "%s:%s\n" % (sp, seq)
                lines.append(line)
        with open(FinalSp2Seq, "w") as sp2seqFile:
            sp2seqFile.write("".join(lines))

        if not args.realign_ali:
            # Keep the filtered alignment as-is.
            filteredfasta.write_fasta(FinalAli)
        else:
            AfterfilteringFasta = TmpAli
            filteredfasta.write_fasta(AfterfilteringFasta)
            ### Realign the final alignment with MAFFT (auto strategy).
            MafftProcess = Aligner.Mafft(TmpAli)
            #MafftProcess.Maxiterate = 2 # too long
            MafftProcess.AutoOption = True
            MafftProcess.QuietOption = True
            MafftProcess.OutputFile = FinalAli

            if os.path.isfile(TmpAli):
                logger.info("Realign the filtered alignment")
                _ = MafftProcess.launch()
                StartingAlignment = MafftProcess.OutputFile
            else:
                logger.error("%s is not a file.", TmpAli)
                end(1)

        ### Build a tree with the final alignment (continues below).
        logger.info("Built a tree with the final alignment")
Пример #27
0
def main():
    """Entry point of the Brandeis transition-based AMR parser (Python 2).

    Parses command-line options, then dispatches on ``args.mode``:
      preprocess      — run CoreNLP preprocessing on the sentences.
      test_gold_graph — preprocess JAMR-aligned AMR.
      align           — align sentences with their AMR graphs and pickle
                        the gold graphs.
      userGuide       — interactively test user-guided actions.
      oracleGuide     — evaluate the deterministic oracle on the
                        training data.
      train / parse   — train a model / parse a corpus.
      eval            — error analysis over pickled parse results.
    """
    arg_parser = argparse.ArgumentParser(
        description="Brandeis transition-based AMR parser 1.0")
    build_opts(arg_parser)

    args = arg_parser.parse_args()

    amr_file = args.amr_file
    instances = None
    train_instance = None
    # Propagate CLI switches into module-level feature flags.
    constants.FLAG_COREF = args.coref
    constants.FLAG_PROP = args.prop
    constants.FLAG_RNE = args.rne
    constants.FLAG_VERB = args.verblist
    constants.FLAG_ONTO = args.onto
    constants.FLAG_DEPPARSER = args.depparser

    if args.mode == 'preprocess':
        # using corenlp to preprocess the sentences
        do_preproces(args)

    elif args.mode == 'test_gold_graph':
        # preprocess the JAMR aligned amr
        do_test_gold_graph(args)

    elif args.mode == 'align':
        # do alignment
        if args.input_file:
            instances = pickle.load(open(args.input_file, 'rb'))
        else:
            raise ValueError(
                "Missing data file! specify it using --input or using preprocessing!"
            )
        gold_instances_file = args.input_file.split('.')[0] + '_gold.p'

        print >> log, "Doing alignment..."

        if LOGGED:
            # Redirect stderr into a log file for the duration of alignment.
            saveerr = sys.stderr
            sys.stderr = open('./log/alignment.log', 'w')

        amr_aligner = Aligner(verbose=args.verbose)
        ref_graphs = []
        begin = args.begin
        counter = 1
        for i in range(len(instances)):
            snt = instances[i].text
            amr = instances[i].amr
            if args.verbose > 1:
                print >> log, counter
                print >> log, "Sentence:"
                print >> log, snt + '\n'
                print >> log, "AMR:"
                print >> log, amr.to_amr_string()

            # Align sentence tokens to AMR nodes and build the gold span graph.
            alresult = amr_aligner.apply_align(snt, amr)
            ref_amr_graph = SpanGraph.init_ref_graph(amr, alresult)
            instances[i].addGoldGraph(ref_amr_graph)
            if args.verbose > 1:
                print >> log, amr_aligner.print_align_result(alresult, amr)
            counter += 1

        pickle.dump(instances, open(gold_instances_file, 'wb'),
                    pickle.HIGHEST_PROTOCOL)
        if LOGGED:
            sys.stderr.close()
            sys.stderr = saveerr
        print >> log, "Done alignment and gold graph generation."
        sys.exit()

    elif args.mode == 'userGuide':
        # test user guide actions
        print 'Read in training instances...'
        train_instances = preprocess(amr_file, False)

        sentID = int(raw_input("Input the sent ID:"))
        amr_parser = Parser()
        amr_parser.testUserGuide(train_instances[sentID])

        sys.exit()

    elif args.mode == 'oracleGuide':
        # test deterministic oracle
        train_instances = preprocess(amr_file,
                                     start_corenlp=False,
                                     input_format=args.amrfmt,
                                     prp_format=args.prpfmt)
        try:
            hand_alignments = load_hand_alignments(amr_file +
                                                   str('.hand_aligned'))
        except IOError:
            # No hand alignments available; process every instance.
            hand_alignments = []

        start_step = args.start_step
        begin = args.begin
        amr_parser = Parser(oracle_type=DET_T2G_ORACLE_ABT,
                            verbose=args.verbose)
        #ref_graphs = pickle.load(open('./data/ref_graph.p','rb'))
        # Running totals for arc and tag precision/recall.
        n_correct_total = .0
        n_parsed_total = .0
        n_gold_total = .0
        pseudo_gold_amr = []
        n_correct_tag_total = .0
        n_parsed_tag_total = 0.
        n_gold_tag_total = .0

        gold_amr = []
        aligned_instances = []
        for instance in train_instances[begin:]:

            if hand_alignments and instance.comment[
                    'id'] not in hand_alignments:
                continue
            state = amr_parser.testOracleGuide(instance, start_step)
            n_correct_arc, n1, n_parsed_arc, n_gold_arc, n_correct_tag, n_parsed_tag, n_gold_tag = state.evaluate(
            )
            # Sanity check: the two correct-arc counts must agree.
            if n_correct_arc != n1:
                import pdb
                pdb.set_trace()
            n_correct_total += n_correct_arc
            n_parsed_total += n_parsed_arc
            n_gold_total += n_gold_arc
            p = n_correct_arc / n_parsed_arc if n_parsed_arc else .0
            r = n_correct_arc / n_gold_arc if n_gold_arc else .0
            indicator = 'PROBLEM!' if p < 0.5 else ''
            if args.verbose > 2:
                print >> sys.stderr, "Precision: %s Recall: %s  %s\n" % (
                    p, r, indicator)
            n_correct_tag_total += n_correct_tag
            n_parsed_tag_total += n_parsed_tag
            n_gold_tag_total += n_gold_tag
            p1 = n_correct_tag / n_parsed_tag if n_parsed_tag else .0
            r1 = n_correct_tag / n_gold_tag if n_gold_tag else .0
            if args.verbose > 2:
                print >> sys.stderr, "Tagging Precision:%s Recall:%s" % (p1,
                                                                         r1)

            # Append abstract-node alignments to the instance's comment.
            instance.comment['alignments'] +=\
                ''.join(' %s-%s|%s' % (idx-1, idx, instance.amr.get_pid(state.A.abt_node_table[idx]))
                        for idx in state.A.abt_node_table if isinstance(idx,int))

            aligned_instances.append(instance)
            pseudo_gold_amr.append(GraphState.get_parsed_amr(state.A))
        # Overall precision/recall/F1 over all processed instances.
        pt = n_correct_total / n_parsed_total if n_parsed_total != .0 else .0
        rt = n_correct_total / n_gold_total if n_gold_total != .0 else .0
        ft = 2 * pt * rt / (pt + rt) if pt + rt != .0 else .0
        write_parsed_amr(pseudo_gold_amr, aligned_instances, amr_file,
                         'pseudo-gold', hand_alignments)
        print "Total Accuracy: %s, Recall: %s, F-1: %s" % (pt, rt, ft)

        tp = n_correct_tag_total / n_parsed_tag_total if n_parsed_tag_total != .0 else .0
        tr = n_correct_tag_total / n_gold_tag_total if n_gold_tag_total != .0 else .0
        print "Tagging Precision:%s Recall:%s" % (tp, tr)

    elif args.mode == 'train':
        do_train(args)

    elif args.mode == 'parse':
        # actual parsing
        test_instances = preprocess(amr_file,
                                    start_corenlp=False,
                                    input_format=args.amrfmt,
                                    prp_format=args.prpfmt)
        if args.section != 'all':
            print "Choosing corpus section: %s" % (args.section)
            tcr = constants.get_corpus_range(args.section, 'test')
            test_instances = test_instances[tcr[0]:tcr[1]]

        #random.shuffle(test_instances)
        print >> experiment_log, "Loading model: ", args.model
        model = Model.load_model(args.model)
        parser = Parser(model=model,
                        oracle_type=DET_T2G_ORACLE_ABT,
                        action_type=args.actionset,
                        verbose=args.verbose,
                        elog=experiment_log)
        print >> experiment_log, "BEGIN PARSING"
        span_graph_pairs, results = parser.parse_corpus_test(test_instances)
        parsed_suffix = '%s.%s.parsed' % (args.section,
                                          args.model.split('.')[-2])
        write_parsed_amr(results,
                         test_instances,
                         amr_file,
                         suffix=parsed_suffix)

        print >> experiment_log, "DONE PARSING"
        if args.smatcheval:
            # Score the parsed output against the gold AMR with smatch.
            smatch_path = "./smatch_2.0.2/smatch.py"
            python_path = 'python'
            options = '--pr -f'
            parsed_filename = amr_file + '.' + parsed_suffix
            command = '%s %s %s %s %s' % (python_path, smatch_path, options,
                                          parsed_filename, amr_file)

            print 'Evaluation using command: ' + (command)
            print subprocess.check_output(command,
                                          stderr=subprocess.STDOUT,
                                          shell=True)

    elif args.mode == 'eval':
        '''break down error analysis'''
        # TODO: here use pickled file, replace it with parsed AMR and gold AMR
        span_graph_pairs = pickle.load(open(args.eval[0], 'rb'))
        instances = pickle.load(open(args.eval[1], 'rb'))

        amr_parser = Parser(oracle_type=DET_T2G_ORACLE_ABT,
                            verbose=args.verbose)
        error_stat = defaultdict(
            lambda: defaultdict(lambda: defaultdict(list)))
        for spg_pair, instance in zip(span_graph_pairs, instances):
            amr_parser.errorAnalyze(spg_pair[0], spg_pair[1], instance,
                                    error_stat)

    else:
        # Unknown mode: show usage.
        arg_parser.print_help()
Пример #28
0
    command = ["cp", In, Out]

    logger.debug(" ".join(command))
    p = subprocess.Popen(command,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    (out, err) = p.communicate()
    if err:
        logger.error(err)

    return (out, err)


if args.realign_ali:
    ### Realign the input alignment
    InitialMafftProcess = Aligner.Mafft(StartingAlignment)
    InitialMafftProcess.Maxiterate = 2
    InitialMafftProcess.QuietOption = True
    InitialMafftProcess.OutputFile = "%s/%s.fa" % (TmpDirName, "RealignAli")

    if os.path.isfile(StartingAlignment):
        logger.info("Realign the input alignment")
        _ = InitialMafftProcess.launch()
        StartingAlignment = InitialMafftProcess.OutputFile
    else:
        logger.error("%s is not a file.", StartingAlignment)
        end(1)

### Concate all  sp2seq files
logger.info("Concate all Sp2Seq files")
Sp2Seq = "%s/StartingSp2Seq.txt" % (TmpDirName)
Пример #29
0
# NOTE(review): Python 2 script fragment — `seqs`, `data` and `Aligner`
# are defined earlier in the file; `seqs` appears to hold graph objects
# with a `graphData` method — confirm against the full script.
# The commented block below was a balanced-tree (log2) alignment
# schedule, replaced by the explicit pairwise alignments that follow.
# n_seqs = int(sys.argv[1]) if len(sys.argv) >= 2 else 2
# n_seqs = min(n_seqs, len(seqs))

# n = int(ceil(log(n_seqs,2)))

# for i in range(1,n+1):
#     for j in range(0, 1<<n, 1<<i):
#         if j+(1<<(i-1)) >= n_seqs:
#             break

#         print 'aligning',j,j+(1<<(i-1))

#         align = Aligner(seqs[ j ],seqs[ j+(1<<(i-1)) ])
#         align.align()

# Align sequence 0 with sequence 1, then dump sequence 0's graph.
align = Aligner(seqs[0], seqs[1])
align.align()
data += seqs[0].graphData(arrows=True, vertical=True)
print '01' * 50
print seqs[0]
print '01' * 50

# Align sequence 2 with sequence 3, then dump sequence 2's graph.
align = Aligner(seqs[2], seqs[3])
align.align()
data += seqs[2].graphData(arrows=True, vertical=True)
print '23' * 50
print seqs[2]
print '23' * 50

# Finally merge the two partial alignments by aligning 0 with 2.
align = Aligner(seqs[0], seqs[2])
align.align()
Пример #30
0
# NOTE(review): Python 2 script fragment — `n`, `args`, `Graph` and
# `Aligner` are defined earlier in the file (not visible here).
raw_seqs = []
seqs = []
# Read name/sequence line pairs from the input file until `n` sequences
# are collected (n == -1 means read everything available).
while len(seqs) <= n or n == -1:
    name, seq = args.infile.readline(), args.infile.readline()
    if not name or not seq:
        break
    # name[1:-1] drops the leading '>' and trailing newline;
    # seq[:-1] drops the trailing newline.
    raw_seqs.append({'seq': seq[:-1], 'name': name[1:-1]})
    seqs.append(Graph(seq=seq[:-1], name=name[1:-1]))

if n == -1:
    n = len(seqs)

# Star alignment: fold every remaining sequence onto the first one.
for i in range(1, n):
    if args.verbosity >= 1:
        print 'aligning', 0, i
    align = Aligner(seqs[0], seqs[i])
    align.align()

if args.verbosity >= 5:
    print seqs[0]

# The commented block below is an alternative balanced-tree (log2)
# alignment schedule, kept for reference.
# lg = int(ceil(log(n,2)))
# for i in range(1,lg+1):
#     for j in range(0, 1<<lg, 1<<i):
#         if j+(1<<(i-1)) >= n:
#             break

#         print 'aligning',j,j+(1<<(i-1))

#         align = Aligner(seqs[j], seqs[ j+(1<<(i-1)) ])
#         align.align()
Пример #31
0
def get_weight(training_set):
    weights = [
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    features, weights = Aligner.align(training_set.p_str_tokens,
        training_set.h_str_tokens, weights)
    print weights
Пример #32
0
def main():
    '''
    usage = "Usage:%prog [options] amr_file"
    opt = OptionParser(usage=usage)
    opt.add_option("-v",action="store",dest="verbose",type='int',
                   default=0,help="set up verbose level")
    opt.add_option("-a",action="store_true",dest="align",
                   default=False,help="do alignment between sentence and amr")
    opt.add_option("-b",action="store",dest="begin",type='int',
                   default=0,help="for debugging"
                   "When do alignment, where the alignment begins"
                   "When test oracle, where to begin")
    opt.add_option("-s",action="store",dest="start_step",type='int',
                   default=0,help="where the step begins,for testing oracle")
    opt.add_option("-o",action="store",dest="sentfilep",
                   help="output sentences to file and parse the sentence into dependency graph")
    opt.add_option("-i",action="store",dest="parsedfilep",
                   help="read parsed dependency graph from file")
    opt.add_option("-g",action="store",dest="userActfile",
                   help="read user input action sequences as guide")
    opt.add_option("-d",action="store",dest="oracle",type='int',default=0,\
                   help="test the output actions of deterministic oracle: "
                         "1: tree oracle 2: list-based oracle")
    '''
    arg_parser = argparse.ArgumentParser(description="Brandeis transition-based AMR parser 1.0")
    
    arg_parser.add_argument('-v','--verbose',type=int,default=0,help='set up verbose level for debug')
    arg_parser.add_argument('-b','--begin',type=int,default=0,help='specify which sentence to begin the alignment or oracle testing for debug')
    arg_parser.add_argument('-s','--start_step',type=int,default=0,help='specify which step to begin oracle testing;for debug')
    #arg_parser.add_argument('-i','--input_file',help='the input: preprocessed data instances file for aligner or training')
    arg_parser.add_argument('-d','--dev',help='development file')
    arg_parser.add_argument('-as','--actionset',choices=['basic'],default='basic',help='choose different action set')
    arg_parser.add_argument('-m','--mode',choices=['preprocess','test_gold_graph','align','userGuide','oracleGuide','train','parse','eval'],help="preprocess:generate pos tag, dependency tree, ner\n" "align:do alignment between AMR graph and sentence string")
    arg_parser.add_argument('-dp','--depparser',choices=['stanford','stanfordConvert','stdconv+charniak','clear','mate','turbo'],default='stdconv+charniak',help='choose the dependency parser')
    arg_parser.add_argument('--coref',action='store_true',help='flag to enable coreference information')
    arg_parser.add_argument('--prop',action='store_true',help='flag to enable semantic role labeling information')
    arg_parser.add_argument('--model',help='specify the model file')
    arg_parser.add_argument('--feat',help='feature template file')
    arg_parser.add_argument('-iter','--iterations',default=1,type=int,help='training iterations')
    arg_parser.add_argument('amr_file',nargs='?',help='amr annotation file/input sentence file for parsing')
    arg_parser.add_argument('--amrfmt',action='store_true',help='specifying the input file is AMR annotation file')
    arg_parser.add_argument('-e','--eval',nargs=2,help='Error Analysis: give parsed AMR file and gold AMR file')
    arg_parser.add_argument('--section',choices=['proxy','all'],default='all',help='choose section of the corpus. Only works for LDC2014T12 dataset.')

    args = arg_parser.parse_args()

    amr_file = args.amr_file
    instances = None
    train_instance = None
    constants.FLAG_COREF=args.coref
    constants.FLAG_PROP=args.prop
    constants.FLAG_DEPPARSER=args.depparser

    # using corenlp to preprocess the sentences 
    if args.mode == 'preprocess':
        instances = preprocess(amr_file,START_SNLP=True,INPUT_AMR=args.amrfmt)
        print "Done preprocessing!"
    # preprocess the JAMR aligned amr
    elif args.mode == 'test_gold_graph':     
        instances = preprocess(amr_file,False)
        #instances = pickle.load(open('data/gold_edge_graph.pkl','rb'))
        gold_amr = []
        for inst in instances:
            GraphState.sent = inst.tokens
            gold_amr.append(GraphState.get_parsed_amr(inst.gold_graph))
        #pseudo_gold_amr = [GraphState.get_parsed_amr(inst.gold_graph) for inst in instances]
        write_parsed_amr(gold_amr,instances,amr_file,'abt.gold')
        #instances = preprocess_aligned(amr_file)
        print "Done output AMR!"
    # do alignment
    elif args.mode == 'align':

        if args.input_file:
            instances = pickle.load(open(args.input_file,'rb'))
        else:
            raise ValueError("Missing data file! specify it using --input or using preprocessing!")
        gold_instances_file = args.input_file.split('.')[0]+'_gold.p'

        print >> log, "Doing alignment..."

        if LOGGED:
            saveerr = sys.stderr
            sys.stderr = open('./log/alignment.log','w')

        amr_aligner = Aligner(verbose=args.verbose)
        ref_graphs = []
        begin = args.begin 
        counter = 1
        #for snt, amr in zip(snts[begin:],amrs[begin:]):
        for i in range(len(instances)):
            snt = instances[i].text
            amr = instances[i].amr
            if args.verbose > 1:
                print >> log, counter
                print >> log, "Sentence:"
                print >> log, snt+'\n'
                
                print >> log, "AMR:"                
                print >> log, amr.to_amr_string()

            alresult = amr_aligner.apply_align(snt,amr)
            ref_amr_graph = SpanGraph.init_ref_graph(amr,alresult)
            #ref_graphs.append(ref_amr_graph)
            instances[i].addGoldGraph(ref_amr_graph)
            if args.verbose > 1:
                #print >> log, "Reference tuples:"
                #print >> log, ref_depGraph.print_tuples()
                print >> log, amr_aligner.print_align_result(alresult,amr)
                #raw_input('ENTER to continue')
            counter += 1

        pickle.dump(instances,open(gold_instances_file,'wb'),pickle.HIGHEST_PROTOCOL)
        #pickle.dump(ref_graphs,open('./data/ref_graph.p','wb'),pickle.HIGHEST_PROTOCOL)
        if LOGGED:
            sys.stderr.close() 
            sys.stderr = saveerr
        print >> log, "Done alignment and gold graph generation."
        sys.exit()
        
    # test user guide actions
    elif args.mode == 'userGuide':
        print 'Read in training instances...'
        train_instances = preprocess(amr_file,False)

        sentID = int(raw_input("Input the sent ID:"))
        amr_parser = Parser()
        amr_parser.testUserGuide(train_instances[sentID])

        sys.exit()

    # test deterministic oracle 
    elif args.mode == 'oracleGuide':
        
        train_instances = preprocess(amr_file,START_SNLP=False)
        try:
            hand_alignments = load_hand_alignments(amr_file+str('.hand_aligned'))
        except IOError:
            hand_alignments = []


        start_step = args.start_step
        begin = args.begin
        amr_parser = Parser(oracle_type=DET_T2G_ORACLE_ABT,verbose=args.verbose)
        #ref_graphs = pickle.load(open('./data/ref_graph.p','rb'))
        n_correct_total = .0
        n_parsed_total = .0
        n_gold_total = .0
        pseudo_gold_amr = []
        n_correct_tag_total = .0
        n_parsed_tag_total = 0.
        n_gold_tag_total = .0

        
        gold_amr = []
        aligned_instances = []
        #print "shuffling training instances"
        #random.shuffle(train_instances)
        for instance in train_instances[begin:]:
            
            if hand_alignments and instance.comment['id'] not in hand_alignments: continue
            state = amr_parser.testOracleGuide(instance,start_step)
            n_correct_arc,n1,n_parsed_arc, n_gold_arc,n_correct_tag,n_parsed_tag,n_gold_tag = state.evaluate()
            #assert n_correct_arc == n1
            if n_correct_arc != n1:
                import pdb
                pdb.set_trace()
            n_correct_total += n_correct_arc
            n_parsed_total += n_parsed_arc
            n_gold_total += n_gold_arc
            p = n_correct_arc/n_parsed_arc if n_parsed_arc else .0
            r = n_correct_arc/n_gold_arc if n_parsed_arc else .0
            indicator = 'PROBLEM!' if p < 0.5 else ''
            if args.verbose > 2: print >> sys.stderr, "Precision: %s Recall: %s  %s\n" % (p,r,indicator)
            n_correct_tag_total +=  n_correct_tag
            n_parsed_tag_total +=  n_parsed_tag
            n_gold_tag_total += n_gold_tag
            p1 = n_correct_tag/n_parsed_tag if n_parsed_tag else .0
            r1 = n_correct_tag/n_gold_tag if n_parsed_tag else .0
            if args.verbose > 2: print >> sys.stderr,"Tagging Precision:%s Recall:%s" % (p1,r1)

            instance.comment['alignments'] += ''.join(' %s-%s|%s'%(idx-1,idx,instance.amr.get_pid(state.A.abt_node_table[idx])) for idx in state.A.abt_node_table if isinstance(idx,int))

            aligned_instances.append(instance)
            pseudo_gold_amr.append(GraphState.get_parsed_amr(state.A))
            #gold_amr.append(instance.amr)
            #assert set(state.A.tuples()) == set(instance.gold_graph.tuples())
        pt = n_correct_total/n_parsed_total if n_parsed_total != .0 else .0
        rt = n_correct_total/n_gold_total if n_gold_total !=.0 else .0
        ft = 2*pt*rt/(pt+rt) if pt+rt != .0 else .0
        write_parsed_amr(pseudo_gold_amr,aligned_instances,amr_file,'pseudo-gold',hand_alignments)
        print "Total Accuracy: %s, Recall: %s, F-1: %s" % (pt,rt,ft)

        tp = n_correct_tag_total/n_parsed_tag_total if n_parsed_tag_total != .0 else .0
        tr = n_correct_tag_total/n_gold_tag_total if n_gold_tag_total != .0 else .0
        print "Tagging Precision:%s Recall:%s" % (tp,tr)

        #amr_parser.record_actions('data/action_set.txt')
    elif args.mode == 'train': # training
        print "Parser Config:"
        print "Incorporate Coref Information: %s"%(constants.FLAG_COREF)
        print "Incorporate SRL Information: %s"%(constants.FLAG_PROP)
        print "Dependency parser used: %s"%(constants.FLAG_DEPPARSER)
        train_instances = preprocess(amr_file,START_SNLP=False)        
        if args.dev: dev_instances = preprocess(args.dev,START_SNLP=False)


        if args.section != 'all':
            print "Choosing corpus section: %s"%(args.section)
            tcr = constants.get_corpus_range(args.section,'train')
            train_instances = train_instances[tcr[0]:tcr[1]]
            if args.dev:
                dcr = constants.get_corpus_range(args.section,'dev')
                dev_instances = dev_instances[dcr[0]:dcr[1]]

        
        feat_template = args.feat if args.feat else None
        model = Model(elog=experiment_log)
        #model.output_feature_generator()
        parser = Parser(model=model,oracle_type=DET_T2G_ORACLE_ABT,action_type=args.actionset,verbose=args.verbose,elog=experiment_log)
        model.setup(action_type=args.actionset,instances=train_instances,parser=parser,feature_templates_file=feat_template)
        
        print >> experiment_log, "BEGIN TRAINING!"
        for iter in xrange(1,args.iterations+1):
            print >> experiment_log, "shuffling training instances"
            random.shuffle(train_instances)
            
            print >> experiment_log, "Iteration:",iter
            begin_updates = parser.perceptron.get_num_updates()
            parser.parse_corpus_train(train_instances)
            parser.perceptron.average_weight()
            #model.save_model(args.model+'-iter'+str(iter)+'-'+str(int(time.time()))+'.m')
            model.save_model(args.model+'-iter'+str(iter)+'.m')
            if args.dev:
                print >> experiment_log ,"Result on develop set:"                
                _,parsed_amr = parser.parse_corpus_test(dev_instances)
                write_parsed_amr(parsed_amr,dev_instances,args.dev,args.section+'.'+str(iter)+'.parsed')

        print >> experiment_log ,"DONE TRAINING!"
        
    elif args.mode == 'parse': # actual parsing
        test_instances = preprocess(amr_file,START_SNLP=False,INPUT_AMR=False)
        if args.section != 'all':
            print "Choosing corpus section: %s"%(args.section)
            tcr = constants.get_corpus_range(args.section,'test')
            test_instances = test_instances[tcr[0]:tcr[1]]
            
        #random.shuffle(test_instances)
        print >> experiment_log, "Loading model: ", args.model 
        model = Model.load_model(args.model)
        parser = Parser(model=model,oracle_type=DET_T2G_ORACLE_ABT,action_type=args.actionset,verbose=args.verbose,elog=experiment_log)
        print >> experiment_log ,"BEGIN PARSING"
        span_graph_pairs,results = parser.parse_corpus_test(test_instances)
        write_parsed_amr(results,test_instances,amr_file,suffix='%s.parsed'%(args.section))
        #write_span_graph(span_graph_pairs,test_instances,amr_file,suffix='spg.50')
        ################
        # for eval     #
        ################
        #pickle.dump(span_graph_pairs,open('data/eval/%s_spg_pair.pkl'%(amr_file),'wb'),pickle.HIGHEST_PROTOCOL)
        #pickle.dump(test_instances,open('data/eval/%s_instances.pkl'%(amr_file),'wb'),pickle.HIGHEST_PROTOCOL)
        print >> experiment_log ,"DONE PARSING"
        
        #plt.hist(results)
        #plt.savefig('result.png')

    elif args.mode == 'eval':
        '''break down error analysis'''
        # TODO: here use pickled file, replace it with parsed AMR and gold AMR
        span_graph_pairs = pickle.load(open(args.eval[0],'rb'))
        instances = pickle.load(open(args.eval[1],'rb'))
        
        amr_parser = Parser(oracle_type=DET_T2G_ORACLE_ABT,verbose=args.verbose)
        error_stat = defaultdict(lambda:defaultdict(lambda:defaultdict(list)))
        for spg_pair,instance in zip(span_graph_pairs,instances):
            amr_parser.errorAnalyze(spg_pair[0],spg_pair[1],instance,error_stat)

    else:
        arg_parser.print_help()
def main():
    '''
    usage = "Usage:%prog [options] amr_file"
    opt = OptionParser(usage=usage)
    opt.add_option("-v",action="store",dest="verbose",type='int',
                   default=0,help="set up verbose level")
    opt.add_option("-a",action="store_true",dest="align",
                   default=False,help="do alignment between sentence and amr")
    opt.add_option("-b",action="store",dest="begin",type='int',
                   default=0,help="for debugging"
                   "When do alignment, where the alignment begins"
                   "When test oracle, where to begin")
    opt.add_option("-s",action="store",dest="start_step",type='int',
                   default=0,help="where the step begins,for testing oracle")
    opt.add_option("-o",action="store",dest="sentfilep",
                   help="output sentences to file and parse the sentence into dependency graph")
    opt.add_option("-i",action="store",dest="parsedfilep",
                   help="read parsed dependency graph from file")
    opt.add_option("-g",action="store",dest="userActfile",
                   help="read user input action sequences as guide")
    opt.add_option("-d",action="store",dest="oracle",type='int',default=0,\
                   help="test the output actions of deterministic oracle: "
                         "1: tree oracle 2: list-based oracle")
    '''
    arg_parser = argparse.ArgumentParser(description="Brandeis transition-based AMR parser 1.0")
    
    arg_parser.add_argument('-v','--verbose',type=int,default=0,help='set up verbose level for debug')
    arg_parser.add_argument('-b','--begin',type=int,default=0,help='specify which sentence to begin the alignment or oracle testing for debug')
    arg_parser.add_argument('-s','--start_step',type=int,default=0,help='specify which step to begin oracle testing for debug')
    #arg_parser.add_argument('-i','--input_file',help='the input: preprocessed data instances file for aligner or training')
    arg_parser.add_argument('-d','--dev',help='development file')
    arg_parser.add_argument('-as','--actionset',choices=['basic'],default='basic',help='choose different action set')
    arg_parser.add_argument('-m','--mode',choices=['preprocess','test_gold_graph','align','userGuide','oracleGuide','train','parse'],help="preprocess:generate pos tag, dependency tree, ner\n" "align:do alignment between AMR graph and sentence")
    arg_parser.add_argument('-dp','--depparser',choices=['stanford','turbo','mate','malt','stdconv+charniak'],default='stanford',help='choose the dependency parser, default:{stanford}')
    arg_parser.add_argument('--model',help='specify the model file')
    arg_parser.add_argument('--feat',help='feature template file')
    arg_parser.add_argument('-iter','--iterations',type=int,help='training iterations')
    arg_parser.add_argument('amr_file',nargs='?',help='amr bank file for preprocessing')
    

    args = arg_parser.parse_args()

    amr_file = args.amr_file
    instances = None
    train_instance = None

    constants.FLAG_DEPPARSER=args.depparser

    # using corenlp to preprocess the sentences 
    if args.mode == 'preprocess':
        instances = preprocess(amr_file)
        print >> experiment_log, "Done preprocessing!"
    # preprocess the JAMR aligned amr
    elif args.mode == 'test_gold_graph':     
        instances = preprocess(amr_file,False)
        #instances = pickle.load(open('data/gold_edge_graph.pkl','rb'))
        pseudo_gold_amr = []
        for inst in instances:
            GraphState.sent = inst.tokens
            pseudo_gold_amr.append(GraphState.get_parsed_amr(inst.gold_graph))
        #pseudo_gold_amr = [GraphState.get_parsed_amr(inst.gold_graph) for inst in instances]
        write_parsed_amr(pseudo_gold_amr,instances,amr_file,'gold')
        #instances = preprocess_aligned(amr_file)
        print "Done output AMR!"
    # do alignment
    elif args.mode == 'align':

        if args.input_file:
            instances = pickle.load(open(args.input_file,'rb'))
        else:
            raise ValueError("Missing data file! specify it using --input or using preprocessing!")
        gold_instances_file = args.input_file.split('.')[0]+'_gold.p'

        print >> log, "Doing alignment..."

        if LOGGED:
            saveerr = sys.stderr
            sys.stderr = open('./log/alignment.log','w')

        amr_aligner = Aligner(verbose=args.verbose)
        ref_graphs = []
        begin = args.begin 
        counter = 1
        #for snt, amr in zip(snts[begin:],amrs[begin:]):
        for i in range(len(instances)):
            snt = instances[i].text
            amr = instances[i].amr
            if args.verbose > 1:
                print >> log, counter
                print >> log, "Sentence:"
                print >> log, snt+'\n'
                
                print >> log, "AMR:"                
                print >> log, amr.to_amr_string()

            alresult = amr_aligner.apply_align(snt,amr)
            ref_amr_graph = SpanGraph.init_ref_graph(amr,alresult)
            #ref_graphs.append(ref_amr_graph)
            instances[i].addGoldGraph(ref_amr_graph)
            if args.verbose > 1:
                #print >> log, "Reference tuples:"
                #print >> log, ref_depGraph.print_tuples()
                print >> log, amr_aligner.print_align_result(alresult,amr)
                #raw_input('ENTER to continue')
            counter += 1

        pickle.dump(instances,open(gold_instances_file,'wb'),pickle.HIGHEST_PROTOCOL)
        #pickle.dump(ref_graphs,open('./data/ref_graph.p','wb'),pickle.HIGHEST_PROTOCOL)
        if LOGGED:
            sys.stderr.close() 
            sys.stderr = saveerr
        print >> log, "Done alignment and gold graph generation."
        sys.exit()
        
    # test user guide actions
    elif args.mode == 'userGuide':
        print 'Read in training instances...'
        train_instances = preprocess(amr_file,False)

        sentID = int(raw_input("Input the sent ID:"))
        amr_parser = Parser()
        amr_parser.testUserGuide(train_instances[sentID])

        sys.exit()

    # test deterministic oracle 
    elif args.mode == 'oracleGuide':
        
        train_instances = preprocess(amr_file,False)

        start_step = args.start_step
        begin = args.begin
        amr_parser = Parser(oracle_type=DETERMINE_TREE_TO_GRAPH_ORACLE_SC,verbose=args.verbose)
        #ref_graphs = pickle.load(open('./data/ref_graph.p','rb'))
        n_correct_total = .0
        n_parsed_total = .0
        n_gold_total = .0
        pseudo_gold_amr = []
        for instance in train_instances[begin:]:
            state = amr_parser.testOracleGuide(instance,start_step)
            n_correct_arc,n1,n_parsed_arc, n_gold_arc,_,_,_ = state.evaluate()
            assert n_correct_arc == n1
            n_correct_total += n_correct_arc
            n_parsed_total += n_parsed_arc
            n_gold_total += n_gold_arc
            p = n_correct_arc/n_parsed_arc if n_parsed_arc else .0
            indicator = 'PROBLEM!' if p < 0.5 else ''
            if args.dev > 2: print >> sys.stderr, "Accuracy: %s  %s\n" % (p,indicator)
            #if instance.sentID == 704:
            #    import pdb
            #    pdb.set_trace()
            pseudo_gold_amr.append(GraphState.get_parsed_amr(state.A))
            #assert set(state.A.tuples()) == set(instance.gold_graph.tuples())
        pt = n_correct_total/n_parsed_total if n_parsed_total != .0 else .0
        rt = n_correct_total/n_gold_total if n_gold_total !=.0 else .0
        ft = 2*pt*rt/(pt+rt) if pt+rt != .0 else .0
        write_parsed_amr(pseudo_gold_amr,train_instances,amr_file,'pseudo-gold')
        print "Total Accuracy: %s, Recall: %s, F-1: %s" % (pt,rt,ft)

        #amr_parser.record_actions('data/action_set.txt')
    elif args.mode == 'train': # actual parsing
        train_instances = preprocess(amr_file,False)
        if args.dev: dev_instances = preprocess(args.dev,False)
        feat_template = args.feat if args.feat else None
        model = Model(elog=experiment_log)
        model.setup(action_type=args.actionset,instances=train_instances,feature_templates_file=feat_template)
        #model.output_feature_generator()
        parser = Parser(model=model,action_type=args.actionset,verbose=args.verbose,elog=experiment_log)
        
        print >> experiment_log, "BEGIN TRAINING!"
        for iter in xrange(1,args.iterations+1):
            print >> experiment_log, "shuffling training instances"
            random.shuffle(train_instances)
            
            print >> experiment_log, "Iteration:",iter
            begin_updates = parser.perceptron.get_num_updates()
            parser.parse_corpus_train(train_instances)
            parser.perceptron.average_weight()
            #model.save_model(args.model+'-iter'+str(iter)+'-'+str(int(time.time()))+'.m')
            model.save_model(args.model+'-iter'+str(iter)+'.m')
            if args.dev:
                print >> experiment_log ,"Result on develop set:"                
                parsed_amr = parser.parse_corpus_test(dev_instances)
                write_parsed_amr(parsed_amr,dev_instances,args.dev)

        print >> experiment_log ,"DONE TRAINING!"
        
    elif args.mode == 'parse':        
        test_instances = preprocess(amr_file,False)

        model = Model.load_model(args.model)
        parser = Parser(model=model,action_type=args.actionset,verbose=args.verbose,elog=experiment_log)
        print >> experiment_log ,"BEGIN PARSING"
        results = parser.parse_corpus_test(test_instances)
        write_parsed_amr(results,test_instances,amr_file)
        print >> experiment_log ,"DONE PARSING"
        #pickle.dump(results,open('data/gold_edge_graph.pkl','wb'),pickle.HIGHEST_PROTOCOL)
        #plt.hist(results)
        #plt.savefig('result.png')
    else:
        arg_parser.print_help()